diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..a76c39a2b572f010ff85525e54cae1fe3bf49acc --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "mast3r"] + path = mast3r + url = https://github.com/naver/mast3r \ No newline at end of file diff --git a/app.py b/app.py index 227ffab7b97dced8c1477be7f86f1157e0240a0e..c11c8092d33b170e94e3397ff686de77799e9c95 100644 --- a/app.py +++ b/app.py @@ -42,7 +42,8 @@ import torchvision.transforms as tvf sys.path.append(os.path.abspath('./modules/ultralytics')) from transformers import AutoTokenizer, AutoModel, AutoProcessor, SamModel -from modules.mast3r.model import AsymmetricMASt3R +# from modules.mast3r.model import AsymmetricMASt3R +from mast3r.model import AsymmetricMASt3R # from modules.sam2.build_sam import build_sam2_video_predictor from modules.mobilesamv2.promt_mobilesamv2 import ObjectAwareModel @@ -52,8 +53,8 @@ from sam2.sam2_video_predictor import SAM2VideoPredictor silent = False -# device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' # -# pe3r = Models('cpu') # +device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' # +# pe3r = Models('cpu') # 'cpu' # # print(device) def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, @@ -259,7 +260,7 @@ def slerp_multiple(vectors, t_values): @torch.no_grad def get_mask_from_img_sam1(yolov8, mobilesamv2, sam1_image, yolov8_image, original_size, input_size, transform): - device = 'cuda' if torch.cuda.is_available() else 'cpu' + # device = 'cuda' if torch.cuda.is_available() else 'cpu' sam_mask=[] @@ -316,7 +317,7 @@ def get_mask_from_img_sam1(yolov8, mobilesamv2, sam1_image, yolov8_image, origin @torch.no_grad def get_cog_feats(images, sam2, siglip, siglip_processor, yolov8, mobilesamv2): - device = 'cuda' if torch.cuda.is_available() else 'cpu' + # device = 'cuda' if torch.cuda.is_available() else 'cpu' cog_seg_maps = [] rev_cog_seg_maps = [] @@ -462,7 +463,8 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr, then run get_3D_model_from_scene """ - device = 'cuda' if torch.cuda.is_available() else 'cpu' + # device = 'cuda' if torch.cuda.is_available() else 'cpu' + MAST3R_CKP = 'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric' mast3r = AsymmetricMASt3R.from_pretrained(MAST3R_CKP).to(device) @@ -542,10 +544,7 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr, outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size) - print(scene) scene.to('cpu') - print(scene) - torch.cuda.empty_cache() return scene, outfile diff --git a/modules/croco/LICENSE b/modules/croco/LICENSE deleted file mode 100644 index d9b84b1a65f9db6d8920a9048d162f52ba3ea56d..0000000000000000000000000000000000000000 --- a/modules/croco/LICENSE +++ /dev/null @@ -1,52 +0,0 @@ -CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. - -A summary of the CC BY-NC-SA 4.0 license is located here: - https://creativecommons.org/licenses/by-nc-sa/4.0/ - -The CC BY-NC-SA 4.0 license is located here: - https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode - - -SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py - -*************************** - -NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py - -This software is being redistributed in a modifiled form. The original form is available here: - -https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py - -This software in this file incorporates parts of the following software available here: - -Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py -available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE - -MoCo v3: https://github.com/facebookresearch/moco-v3 -available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE - -DeiT: https://github.com/facebookresearch/deit -available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE - - -ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: - -https://github.com/facebookresearch/mae/blob/main/LICENSE - -Attribution-NonCommercial 4.0 International - -*************************** - -NOTICE WITH RESPECT TO THE FILE: models/blocks.py - -This software is being redistributed in a modifiled form. The original form is available here: - -https://github.com/rwightman/pytorch-image-models - -ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: - -https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE - -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ \ No newline at end of file diff --git a/modules/croco/NOTICE b/modules/croco/NOTICE deleted file mode 100644 index d51bb365036c12d428d6e3a4fd00885756d5261c..0000000000000000000000000000000000000000 --- a/modules/croco/NOTICE +++ /dev/null @@ -1,21 +0,0 @@ -CroCo -Copyright 2022-present NAVER Corp. - -This project contains subcomponents with separate copyright notices and license terms. -Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. - -==== - -facebookresearch/mae -https://github.com/facebookresearch/mae - -Attribution-NonCommercial 4.0 International - -==== - -rwightman/pytorch-image-models -https://github.com/rwightman/pytorch-image-models - -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ \ No newline at end of file diff --git a/modules/croco/README.MD b/modules/croco/README.MD deleted file mode 100644 index 38e33b001a60bd16749317fb297acd60f28a6f1b..0000000000000000000000000000000000000000 --- a/modules/croco/README.MD +++ /dev/null @@ -1,124 +0,0 @@ -# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow - -[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)] - -This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), refered to as CroCo v2: - -![image](assets/arch.jpg) - -```bibtex -@inproceedings{croco, - title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}}, - author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}}, - booktitle={{NeurIPS}}, - year={2022} -} - -@inproceedings{croco_v2, - title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}}, - author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me}, - booktitle={ICCV}, - year={2023} -} -``` - -## License - -The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information. -Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License. -Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license. - -## Preparation - -1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version. - -```bash -conda create -n croco python=3.7 cmake=3.14.0 -conda activate croco -conda install habitat-sim headless -c conda-forge -c aihabitat -conda install pytorch torchvision -c pytorch -conda install notebook ipykernel matplotlib -conda install ipywidgets widgetsnbextension -conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation - -``` - -2. Compile cuda kernels for RoPE - -CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels. -```bash -cd models/curope/ -python setup.py build_ext --inplace -cd ../../ -``` - -This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only. -You might also need to set the environment `CUDA_HOME` in case you use a custom cuda installation. - -In case you cannot provide, we also provide a slow pytorch version, which will be automatically loaded. - -3. Download pre-trained model - -We provide several pre-trained models: - -| modelname | pre-training data | pos. embed. | Encoder | Decoder | -|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------| -| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small | -| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small | -| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base | -| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base | - -To download a specific model, i.e., the first one (`CroCo.pth`) -```bash -mkdir -p pretrained_models/ -wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/ -``` - -## Reconstruction example - -Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`) -```bash -python demo.py -``` - -## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator - -First download the test scene from Habitat: -```bash -python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/ -``` - -Then, run the Notebook demo `interactive_demo.ipynb`. - -In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo. -![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg) - -## Pre-training - -### CroCo - -To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command: -``` -torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/ -``` - -Our CroCo pre-training was launched on a single server with 4 GPUs. -It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training. -Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. -The first run can take a few minutes to start, to parse all available pre-training pairs. - -### CroCo v2 - -For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD). -Then, run the following command for the largest model (ViT-L encoder, Base decoder): -``` -torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/ -``` - -Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases. -The largest model should take around 12 days on A100. -Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. - -## Stereo matching and Optical flow downstream tasks - -For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD). diff --git a/modules/croco/datasets/__init__.py b/modules/croco/datasets/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/modules/croco/datasets/crops/README.MD b/modules/croco/datasets/crops/README.MD deleted file mode 100644 index 47ddabebb177644694ee247ae878173a3a16644f..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/crops/README.MD +++ /dev/null @@ -1,104 +0,0 @@ -## Generation of crops from the real datasets - -The instructions below allow to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL. - -### Download the metadata of the crops to generate - -First, download the metadata and put them in `./data/`: -``` -mkdir -p data -cd data/ -wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip -unzip crop_metadata.zip -rm crop_metadata.zip -cd .. -``` - -### Prepare the original datasets - -Second, download the original datasets in `./data/original_datasets/`. -``` -mkdir -p data/original_datasets -``` - -##### ARKitScenes - -Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`. -The resulting file structure should be like: -``` -./data/original_datasets/ARKitScenes/ -└───Training - └───40753679 - │ │ ultrawide - │ │ ... - └───40753686 - │ - ... -``` - -##### MegaDepth - -Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`. -The resulting file structure should be like: - -``` -./data/original_datasets/MegaDepth/ -└───0000 -│ └───images -│ │ │ 1000557903_87fa96b8a4_o.jpg -│ │ └ ... -│ └─── ... -└───0001 -│ │ -│ └ ... -└─── ... -``` - -##### 3DStreetView - -Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`. -The resulting file structure should be like: - -``` -./data/original_datasets/3DStreetView/ -└───dataset_aligned -│ └───0002 -│ │ │ 0000002_0000001_0000002_0000001.jpg -│ │ └ ... -│ └─── ... -└───dataset_unaligned -│ └───0003 -│ │ │ 0000003_0000001_0000002_0000001.jpg -│ │ └ ... -│ └─── ... -``` - -##### IndoorVL - -Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture). - -``` -pip install kapture -mkdir -p ./data/original_datasets/IndoorVL -cd ./data/original_datasets/IndoorVL -kapture_download_dataset.py update -kapture_download_dataset.py install "HyundaiDepartmentStore_*" -kapture_download_dataset.py install "GangnamStation_*" -cd - -``` - -### Extract the crops - -Now, extract the crops for each of the dataset: -``` -for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL; -do - python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500; -done -``` - -##### Note for IndoorVL - -Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper. -To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively. -The impact on the performance is negligible. diff --git a/modules/croco/datasets/crops/extract_crops_from_images.py b/modules/croco/datasets/crops/extract_crops_from_images.py deleted file mode 100644 index eb66a0474ce44b54c44c08887cbafdb045b11ff3..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/crops/extract_crops_from_images.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Extracting crops for pre-training -# -------------------------------------------------------- - -import os -import argparse -from tqdm import tqdm -from PIL import Image -import functools -from multiprocessing import Pool -import math - - -def arg_parser(): - parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list') - - parser.add_argument('--crops', type=str, required=True, help='crop file') - parser.add_argument('--root-dir', type=str, required=True, help='root directory') - parser.add_argument('--output-dir', type=str, required=True, help='output directory') - parser.add_argument('--imsize', type=int, default=256, help='size of the crops') - parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads') - parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories') - parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir') - return parser - - -def main(args): - listing_path = os.path.join(args.output_dir, 'listing.txt') - - print(f'Loading list of crops ... ({args.nthread} threads)') - crops, num_crops_to_generate = load_crop_file(args.crops) - - print(f'Preparing jobs ({len(crops)} candidate image pairs)...') - num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels) - num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels)) - - jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir) - del crops - - os.makedirs(args.output_dir, exist_ok=True) - mmap = Pool(args.nthread).imap_unordered if args.nthread > 1 else map - call = functools.partial(save_image_crops, args) - - print(f"Generating cropped images to {args.output_dir} ...") - with open(listing_path, 'w') as listing: - listing.write('# pair_path\n') - for results in tqdm(mmap(call, jobs), total=len(jobs)): - for path in results: - listing.write(f'{path}\n') - print('Finished writing listing to', listing_path) - - -def load_crop_file(path): - data = open(path).read().splitlines() - pairs = [] - num_crops_to_generate = 0 - for line in tqdm(data): - if line.startswith('#'): - continue - line = line.split(', ') - if len(line) < 8: - img1, img2, rotation = line - pairs.append((img1, img2, int(rotation), [])) - else: - l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line) - rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2) - pairs[-1][-1].append((rect1, rect2)) - num_crops_to_generate += 1 - return pairs, num_crops_to_generate - - -def prepare_jobs(pairs, num_levels, num_pairs_in_dir): - jobs = [] - powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))] - - def get_path(idx): - idx_array = [] - d = idx - for level in range(num_levels - 1): - idx_array.append(idx // powers[level]) - idx = idx % powers[level] - idx_array.append(d) - return '/'.join(map(lambda x: hex(x)[2:], idx_array)) - - idx = 0 - for pair_data in tqdm(pairs): - img1, img2, rotation, crops = pair_data - if -60 <= rotation and rotation <= 60: - rotation = 0 # most likely not a true rotation - paths = [get_path(idx + k) for k in range(len(crops))] - idx += len(crops) - jobs.append(((img1, img2), rotation, crops, paths)) - return jobs - - -def load_image(path): - try: - return Image.open(path).convert('RGB') - except Exception as e: - print('skipping', path, e) - raise OSError() - - -def save_image_crops(args, data): - # load images - img_pair, rot, crops, paths = data - try: - img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair] - except OSError as e: - return [] - - def area(sz): - return sz[0] * sz[1] - - tgt_size = (args.imsize, args.imsize) - - def prepare_crop(img, rect, rot=0): - # actual crop - img = img.crop(rect) - - # resize to desired size - interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC - img = img.resize(tgt_size, resample=interp) - - # rotate the image - rot90 = (round(rot/90) % 4) * 90 - if rot90 == 90: - img = img.transpose(Image.Transpose.ROTATE_90) - elif rot90 == 180: - img = img.transpose(Image.Transpose.ROTATE_180) - elif rot90 == 270: - img = img.transpose(Image.Transpose.ROTATE_270) - return img - - results = [] - for (rect1, rect2), path in zip(crops, paths): - crop1 = prepare_crop(img1, rect1) - crop2 = prepare_crop(img2, rect2, rot) - - fullpath1 = os.path.join(args.output_dir, path+'_1.jpg') - fullpath2 = os.path.join(args.output_dir, path+'_2.jpg') - os.makedirs(os.path.dirname(fullpath1), exist_ok=True) - - assert not os.path.isfile(fullpath1), fullpath1 - assert not os.path.isfile(fullpath2), fullpath2 - crop1.save(fullpath1) - crop2.save(fullpath2) - results.append(path) - - return results - - -if __name__ == '__main__': - args = arg_parser().parse_args() - main(args) - diff --git a/modules/croco/datasets/habitat_sim/README.MD b/modules/croco/datasets/habitat_sim/README.MD deleted file mode 100644 index a505781ff9eb91bce7f1d189e848f8ba1c560940..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/README.MD +++ /dev/null @@ -1,76 +0,0 @@ -## Generation of synthetic image pairs using Habitat-Sim - -These instructions allow to generate pre-training pairs from the Habitat simulator. -As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent. - -### Download Habitat-Sim scenes -Download Habitat-Sim scenes: -- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md -- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets. -- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`. -``` -./data/ -└──habitat-sim-data/ - └──scene_datasets/ - ├──hm3d/ - ├──gibson/ - ├──habitat-test-scenes/ - ├──replica_cad_baked_lighting/ - ├──replica_cad/ - ├──ReplicaDataset/ - └──scannet/ -``` - -### Image pairs generation -We provide metadata to generate reproducible images pairs for pretraining and validation. -Experiments described in the paper used similar data, but whose generation was not reproducible at the time. - -Specifications: -- 256x256 resolution images, with 60 degrees field of view . -- Up to 1000 image pairs per scene. -- Number of scenes considered/number of images pairs per dataset: - - Scannet: 1097 scenes / 985 209 pairs - - HM3D: - - hm3d/train: 800 / 800k pairs - - hm3d/val: 100 scenes / 100k pairs - - hm3d/minival: 10 scenes / 10k pairs - - habitat-test-scenes: 3 scenes / 3k pairs - - replica_cad_baked_lighting: 13 scenes / 13k pairs - -- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes. - -Download metadata and extract it: -```bash -mkdir -p data/habitat_release_metadata/ -cd data/habitat_release_metadata/ -wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz -tar -xvf multiview_habitat_metadata.tar.gz -cd ../.. -# Location of the metadata -METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata" -``` - -Generate image pairs from metadata: -- The following command will print a list of commandlines to generate image pairs for each scene: -```bash -# Target output directory -PAIRS_DATASET_DIR="./data/habitat_release/" -python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR -``` -- One can launch multiple of such commands in parallel e.g. using GNU Parallel: -```bash -python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16 -``` - -## Metadata generation - -Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible: -```bash -# Print commandlines to generate image pairs from the different scenes available. -PAIRS_DATASET_DIR=MY_CUSTOM_PATH -python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR - -# Once a dataset is generated, pack metadata files for reproducibility. -METADATA_DIR=MY_CUSTON_PATH -python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR -``` diff --git a/modules/croco/datasets/habitat_sim/__init__.py b/modules/croco/datasets/habitat_sim/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/modules/croco/datasets/habitat_sim/generate_from_metadata.py b/modules/croco/datasets/habitat_sim/generate_from_metadata.py deleted file mode 100644 index fbe0d399084359495250dc8184671ff498adfbf2..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/generate_from_metadata.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -""" -Script to generate image pairs for a given scene reproducing poses provided in a metadata file. -""" -import os -from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator -from datasets.habitat_sim.paths import SCENES_DATASET -import argparse -import quaternion -import PIL.Image -import cv2 -import json -from tqdm import tqdm - -def generate_multiview_images_from_metadata(metadata_filename, - output_dir, - overload_params = dict(), - scene_datasets_paths=None, - exist_ok=False): - """ - Generate images from a metadata file for reproducibility purposes. - """ - # Reorder paths by decreasing label length, to avoid collisions when testing if a string by such label - if scene_datasets_paths is not None: - scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True)) - - with open(metadata_filename, 'r') as f: - input_metadata = json.load(f) - metadata = dict() - for key, value in input_metadata.items(): - # Optionally replace some paths - if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": - if scene_datasets_paths is not None: - for dataset_label, dataset_path in scene_datasets_paths.items(): - if value.startswith(dataset_label): - value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label))) - break - metadata[key] = value - - # Overload some parameters - for key, value in overload_params.items(): - metadata[key] = value - - generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))]) - generate_depth = metadata["generate_depth"] - - os.makedirs(output_dir, exist_ok=exist_ok) - - generator = MultiviewHabitatSimGenerator(**generation_entries) - - # Generate views - for idx_label, data in tqdm(metadata['multiviews'].items()): - positions = data["positions"] - orientations = data["orientations"] - n = len(positions) - for oidx in range(n): - observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx])) - observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 - # Color image saved using PIL - img = PIL.Image.fromarray(observation['color'][:,:,:3]) - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") - img.save(filename) - if generate_depth: - # Depth image as EXR file - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") - cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) - # Camera parameters - camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") - with open(filename, "w") as f: - json.dump(camera_params, f) - # Save metadata - with open(os.path.join(output_dir, "metadata.json"), "w") as f: - json.dump(metadata, f) - - generator.close() - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--metadata_filename", required=True) - parser.add_argument("--output_dir", required=True) - args = parser.parse_args() - - generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename, - output_dir=args.output_dir, - scene_datasets_paths=SCENES_DATASET, - overload_params=dict(), - exist_ok=True) - - \ No newline at end of file diff --git a/modules/croco/datasets/habitat_sim/generate_from_metadata_files.py b/modules/croco/datasets/habitat_sim/generate_from_metadata_files.py deleted file mode 100644 index 962ef849d8c31397b8622df4f2d9140175d78873..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/generate_from_metadata_files.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -""" -Script generating commandlines to generate image pairs from metadata files. -""" -import os -import glob -from tqdm import tqdm -import argparse - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_dir", required=True) - parser.add_argument("--output_dir", required=True) - parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. to setup environment.") - args = parser.parse_args() - - input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True) - - for metadata_filename in tqdm(input_metadata_filenames): - output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir)) - # Do not process the scene if the metadata file already exists - if os.path.exists(os.path.join(output_dir, "metadata.json")): - continue - commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}" - print(commandline) diff --git a/modules/croco/datasets/habitat_sim/generate_multiview_images.py b/modules/croco/datasets/habitat_sim/generate_multiview_images.py deleted file mode 100644 index 421d49a1696474415940493296b3f2d982398850..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/generate_multiview_images.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import os -from tqdm import tqdm -import argparse -import PIL.Image -import numpy as np -import json -from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError -from datasets.habitat_sim.paths import list_scenes_available -import cv2 -import quaternion -import shutil - -def generate_multiview_images_for_scene(scene_dataset_config_file, - scene, - navmesh, - output_dir, - views_count, - size, - exist_ok=False, - generate_depth=False, - **kwargs): - """ - Generate tuples of overlapping views for a given scene. - generate_depth: generate depth images and camera parameters. - """ - if os.path.exists(output_dir) and not exist_ok: - print(f"Scene {scene}: data already generated. Ignoring generation.") - return - try: - print(f"Scene {scene}: {size} multiview acquisitions to generate...") - os.makedirs(output_dir, exist_ok=exist_ok) - - metadata_filename = os.path.join(output_dir, "metadata.json") - - metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file, - scene=scene, - navmesh=navmesh, - views_count=views_count, - size=size, - generate_depth=generate_depth, - **kwargs) - metadata_template["multiviews"] = dict() - - if os.path.exists(metadata_filename): - print("Metadata file already exists:", metadata_filename) - print("Loading already generated metadata file...") - with open(metadata_filename, "r") as f: - metadata = json.load(f) - - for key in metadata_template.keys(): - if key != "multiviews": - assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}." - else: - print("No temporary file found. Starting generation from scratch...") - metadata = metadata_template - - starting_id = len(metadata["multiviews"]) - print(f"Starting generation from index {starting_id}/{size}...") - if starting_id >= size: - print("Generation already done.") - return - - generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file, - scene=scene, - navmesh=navmesh, - views_count = views_count, - size = size, - **kwargs) - - for idx in tqdm(range(starting_id, size)): - # Generate / re-generate the observations - try: - data = generator[idx] - observations = data["observations"] - positions = data["positions"] - orientations = data["orientations"] - - idx_label = f"{idx:08}" - for oidx, observation in enumerate(observations): - observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 - # Color image saved using PIL - img = PIL.Image.fromarray(observation['color'][:,:,:3]) - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") - img.save(filename) - if generate_depth: - # Depth image as EXR file - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") - cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) - # Camera parameters - camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) - filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") - with open(filename, "w") as f: - json.dump(camera_params, f) - metadata["multiviews"][idx_label] = {"positions": positions.tolist(), - "orientations": orientations.tolist(), - "covisibility_ratios": data["covisibility_ratios"].tolist(), - "valid_fractions": data["valid_fractions"].tolist(), - "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()} - except RecursionError: - print("Recursion error: unable to sample observations for this scene. We will stop there.") - break - - # Regularly save a temporary metadata file, in case we need to restart the generation - if idx % 10 == 0: - with open(metadata_filename, "w") as f: - json.dump(metadata, f) - - # Save metadata - with open(metadata_filename, "w") as f: - json.dump(metadata, f) - - generator.close() - except NoNaviguableSpaceError: - pass - -def create_commandline(scene_data, generate_depth, exist_ok=False): - """ - Create a commandline string to generate a scene. - """ - def my_formatting(val): - if val is None or val == "": - return '""' - else: - return val - commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)} - --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)} - --navmesh {my_formatting(scene_data.navmesh)} - --output_dir {my_formatting(scene_data.output_dir)} - --generate_depth {int(generate_depth)} - --exist_ok {int(exist_ok)} - """ - commandline = " ".join(commandline.split()) - return commandline - -if __name__ == "__main__": - os.umask(2) - - parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available: - > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands - """) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true") - parser.add_argument("--scene", type=str, default="") - parser.add_argument("--scene_dataset_config_file", type=str, default="") - parser.add_argument("--navmesh", type=str, default="") - - parser.add_argument("--generate_depth", type=int, default=1) - parser.add_argument("--exist_ok", type=int, default=0) - - kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000) - - args = parser.parse_args() - generate_depth=bool(args.generate_depth) - exist_ok = bool(args.exist_ok) - - if args.list_commands: - # Listing scenes available... - scenes_data = list_scenes_available(base_output_dir=args.output_dir) - - for scene_data in scenes_data: - print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok)) - else: - if args.scene == "" or args.output_dir == "": - print("Missing scene or output dir argument!") - print(parser.format_help()) - else: - generate_multiview_images_for_scene(scene=args.scene, - scene_dataset_config_file = args.scene_dataset_config_file, - navmesh = args.navmesh, - output_dir = args.output_dir, - exist_ok=exist_ok, - generate_depth=generate_depth, - **kwargs) \ No newline at end of file diff --git a/modules/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/modules/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py deleted file mode 100644 index 91e5f923b836a645caf5d8e4aacc425047e3c144..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py +++ /dev/null @@ -1,390 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import os -import numpy as np -import quaternion -import habitat_sim -import json -from sklearn.neighbors import NearestNeighbors -import cv2 - -# OpenCV to habitat camera convention transformation -R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0) -R_HABITAT2OPENCV = R_OPENCV2HABITAT.T -DEG2RAD = np.pi / 180 - -def compute_camera_intrinsics(height, width, hfov): - f = width/2 / np.tan(hfov/2 * np.pi/180) - cu, cv = width/2, height/2 - return f, cu, cv - -def compute_camera_pose_opencv_convention(camera_position, camera_orientation): - R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT - t_cam2world = np.asarray(camera_position) - return R_cam2world, t_cam2world - -def compute_pointmap(depthmap, hfov): - """ Compute a HxWx3 pointmap in camera frame from a HxW depth map.""" - height, width = depthmap.shape - f, cu, cv = compute_camera_intrinsics(height, width, hfov) - # Cast depth map to point - z_cam = depthmap - u, v = np.meshgrid(range(width), range(height)) - x_cam = (u - cu) / f * z_cam - y_cam = (v - cv) / f * z_cam - X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1) - return X_cam - -def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation): - """Return a 3D point cloud corresponding to valid pixels of the depth map""" - R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation) - - X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov) - valid_mask = (X_cam[:,:,2] != 0.0) - - X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()] - X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3) - return X_world - -def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False): - """ - Compute 'overlapping' metrics based on a distance threshold between two point clouds. - """ - nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud2) - distances, indices = nbrs.kneighbors(pointcloud1) - intersection1 = np.count_nonzero(distances.flatten() < distance_threshold) - - data = {"intersection1": intersection1, - "size1": len(pointcloud1)} - if compute_symmetric: - nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud1) - distances, indices = nbrs.kneighbors(pointcloud2) - intersection2 = np.count_nonzero(distances.flatten() < distance_threshold) - data["intersection2"] = intersection2 - data["size2"] = len(pointcloud2) - - return data - -def _append_camera_parameters(observation, hfov, camera_location, camera_rotation): - """ - Add camera parameters to the observation dictionnary produced by Habitat-Sim - In-place modifications. - """ - R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation) - height, width = observation['depth'].shape - f, cu, cv = compute_camera_intrinsics(height, width, hfov) - K = np.asarray([[f, 0, cu], - [0, f, cv], - [0, 0, 1.0]]) - observation["camera_intrinsics"] = K - observation["t_cam2world"] = t_cam2world - observation["R_cam2world"] = R_cam2world - -def look_at(eye, center, up, return_cam2world=True): - """ - Return camera pose looking at a given center point. - Analogous of gluLookAt function, using OpenCV camera convention. - """ - z = center - eye - z /= np.linalg.norm(z, axis=-1, keepdims=True) - y = -up - y = y - np.sum(y * z, axis=-1, keepdims=True) * z - y /= np.linalg.norm(y, axis=-1, keepdims=True) - x = np.cross(y, z, axis=-1) - - if return_cam2world: - R = np.stack((x, y, z), axis=-1) - t = eye - else: - # World to camera transformation - # Transposed matrix - R = np.stack((x, y, z), axis=-2) - t = - np.einsum('...ij, ...j', R, eye) - return R, t - -def look_at_for_habitat(eye, center, up, return_cam2world=True): - R, t = look_at(eye, center, up) - orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T) - return orientation, t - -def generate_orientation_noise(pan_range, tilt_range, roll_range): - return (quaternion.from_rotation_vector(np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP) - * quaternion.from_rotation_vector(np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT) - * quaternion.from_rotation_vector(np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT)) - - -class NoNaviguableSpaceError(RuntimeError): - def __init__(self, *args): - super().__init__(*args) - -class MultiviewHabitatSimGenerator: - def __init__(self, - scene, - navmesh, - scene_dataset_config_file, - resolution = (240, 320), - views_count=2, - hfov = 60, - gpu_id = 0, - size = 10000, - minimum_covisibility = 0.5, - transform = None): - self.scene = scene - self.navmesh = navmesh - self.scene_dataset_config_file = scene_dataset_config_file - self.resolution = resolution - self.views_count = views_count - assert(self.views_count >= 1) - self.hfov = hfov - self.gpu_id = gpu_id - self.size = size - self.transform = transform - - # Noise added to camera orientation - self.pan_range = (-3, 3) - self.tilt_range = (-10, 10) - self.roll_range = (-5, 5) - - # Height range to sample cameras - self.height_range = (1.2, 1.8) - - # Random steps between the camera views - self.random_steps_count = 5 - self.random_step_variance = 2.0 - - # Minimum fraction of the scene which should be valid (well defined depth) - self.minimum_valid_fraction = 0.7 - - # Distance threshold to see to select pairs - self.distance_threshold = 0.05 - # Minimum IoU of a view point cloud with respect to the reference view to be kept. - self.minimum_covisibility = minimum_covisibility - - # Maximum number of retries. - self.max_attempts_count = 100 - - self.seed = None - self._lazy_initialization() - - def _lazy_initialization(self): - # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly - if self.seed == None: - # Re-seed numpy generator - np.random.seed() - self.seed = np.random.randint(2**32-1) - sim_cfg = habitat_sim.SimulatorConfiguration() - sim_cfg.scene_id = self.scene - if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "": - sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file - sim_cfg.random_seed = self.seed - sim_cfg.load_semantic_mesh = False - sim_cfg.gpu_device_id = self.gpu_id - - depth_sensor_spec = habitat_sim.CameraSensorSpec() - depth_sensor_spec.uuid = "depth" - depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH - depth_sensor_spec.resolution = self.resolution - depth_sensor_spec.hfov = self.hfov - depth_sensor_spec.position = [0.0, 0.0, 0] - depth_sensor_spec.orientation - - rgb_sensor_spec = habitat_sim.CameraSensorSpec() - rgb_sensor_spec.uuid = "color" - rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR - rgb_sensor_spec.resolution = self.resolution - rgb_sensor_spec.hfov = self.hfov - rgb_sensor_spec.position = [0.0, 0.0, 0] - agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec]) - - cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg]) - self.sim = habitat_sim.Simulator(cfg) - if self.navmesh is not None and self.navmesh != "": - # Use pre-computed navmesh when available (usually better than those generated automatically) - self.sim.pathfinder.load_nav_mesh(self.navmesh) - - if not self.sim.pathfinder.is_loaded: - # Try to compute a navmesh - navmesh_settings = habitat_sim.NavMeshSettings() - navmesh_settings.set_defaults() - self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True) - - # Ensure that the navmesh is not empty - if not self.sim.pathfinder.is_loaded: - raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})") - - self.agent = self.sim.initialize_agent(agent_id=0) - - def close(self): - self.sim.close() - - def __del__(self): - self.sim.close() - - def __len__(self): - return self.size - - def sample_random_viewpoint(self): - """ Sample a random viewpoint using the navmesh """ - nav_point = self.sim.pathfinder.get_random_navigable_point() - - # Sample a random viewpoint height - viewpoint_height = np.random.uniform(*self.height_range) - viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP - viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) - return viewpoint_position, viewpoint_orientation, nav_point - - def sample_other_random_viewpoint(self, observed_point, nav_point): - """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point.""" - other_nav_point = nav_point - - walk_directions = self.random_step_variance * np.asarray([1,0,1]) - for i in range(self.random_steps_count): - temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3)) - # Snapping may return nan when it fails - if not np.isnan(temp[0]): - other_nav_point = temp - - other_viewpoint_height = np.random.uniform(*self.height_range) - other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP - - # Set viewing direction towards the central point - rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True) - rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) - return position, rotation, other_nav_point - - def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud): - """ Check if a viewpoint is valid and overlaps significantly with a reference one. """ - # Observation - pixels_count = self.resolution[0] * self.resolution[1] - valid_fraction = len(other_pointcloud) / pixels_count - assert valid_fraction <= 1.0 and valid_fraction >= 0.0 - overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True) - covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count) - is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility) - return is_valid, valid_fraction, covisibility - - def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation): - """ Check if a viewpoint is valid and overlaps significantly with a reference one. """ - # Observation - other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation) - return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) - - def render_viewpoint(self, viewpoint_position, viewpoint_orientation): - agent_state = habitat_sim.AgentState() - agent_state.position = viewpoint_position - agent_state.rotation = viewpoint_orientation - self.agent.set_state(agent_state) - viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0) - _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation) - return viewpoint_observations - - def __getitem__(self, useless_idx): - ref_position, ref_orientation, nav_point = self.sample_random_viewpoint() - ref_observations = self.render_viewpoint(ref_position, ref_orientation) - # Extract point cloud - ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, - camera_position=ref_position, camera_rotation=ref_orientation) - - pixels_count = self.resolution[0] * self.resolution[1] - ref_valid_fraction = len(ref_pointcloud) / pixels_count - assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0 - if ref_valid_fraction < self.minimum_valid_fraction: - # This should produce a recursion error at some point when something is very wrong. - return self[0] - # Pick an reference observed point in the point cloud - observed_point = np.mean(ref_pointcloud, axis=0) - - # Add the first image as reference - viewpoints_observations = [ref_observations] - viewpoints_covisibility = [ref_valid_fraction] - viewpoints_positions = [ref_position] - viewpoints_orientations = [quaternion.as_float_array(ref_orientation)] - viewpoints_clouds = [ref_pointcloud] - viewpoints_valid_fractions = [ref_valid_fraction] - - for _ in range(self.views_count - 1): - # Generate an other viewpoint using some dummy random walk - successful_sampling = False - for sampling_attempt in range(self.max_attempts_count): - position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point) - # Observation - other_viewpoint_observations = self.render_viewpoint(position, rotation) - other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation) - - is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) - if is_valid: - successful_sampling = True - break - if not successful_sampling: - print("WARNING: Maximum number of attempts reached.") - # Dirty hack, try using a novel original viewpoint - return self[0] - viewpoints_observations.append(other_viewpoint_observations) - viewpoints_covisibility.append(covisibility) - viewpoints_positions.append(position) - viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding. - viewpoints_clouds.append(other_pointcloud) - viewpoints_valid_fractions.append(valid_fraction) - - # Estimate relations between all pairs of images - pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations))) - for i in range(len(viewpoints_observations)): - pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i] - for j in range(i+1, len(viewpoints_observations)): - overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True) - pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count - pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count - - # IoU is relative to the image 0 - data = {"observations": viewpoints_observations, - "positions": np.asarray(viewpoints_positions), - "orientations": np.asarray(viewpoints_orientations), - "covisibility_ratios": np.asarray(viewpoints_covisibility), - "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float), - "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float), - } - - if self.transform is not None: - data = self.transform(data) - return data - - def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False): - """ - Return a list of images corresponding to a spiral trajectory from a random starting point. - Useful to generate nice visualisations. - Use an even number of half turns to get a nice "C1-continuous" loop effect - """ - ref_position, ref_orientation, navpoint = self.sample_random_viewpoint() - ref_observations = self.render_viewpoint(ref_position, ref_orientation) - ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, - camera_position=ref_position, camera_rotation=ref_orientation) - pixels_count = self.resolution[0] * self.resolution[1] - if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction: - # Dirty hack: ensure that the valid part of the image is significant - return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation) - - # Pick an observed point in the point cloud - observed_point = np.mean(ref_pointcloud, axis=0) - ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation) - - images = [] - is_valid = [] - # Spiral trajectory, use_constant orientation - for i, alpha in enumerate(np.linspace(0, 1, images_count)): - r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius - theta = alpha * half_turns * np.pi - x = r * np.cos(theta) - y = r * np.sin(theta) - z = 0.0 - position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten() - if use_constant_orientation: - orientation = ref_orientation - else: - # trajectory looking at a mean point in front of the ref observation - orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP) - observations = self.render_viewpoint(position, orientation) - images.append(observations['color'][...,:3]) - _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation) - is_valid.append(_is_valid) - return images, np.all(is_valid) \ No newline at end of file diff --git a/modules/croco/datasets/habitat_sim/pack_metadata_files.py b/modules/croco/datasets/habitat_sim/pack_metadata_files.py deleted file mode 100644 index 10672a01f7dd615d3b4df37781f7f6f97e753ba6..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/pack_metadata_files.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -""" -Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere. -""" -import os -import glob -from tqdm import tqdm -import shutil -import json -from datasets.habitat_sim.paths import * -import argparse -import collections - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("input_dir") - parser.add_argument("output_dir") - args = parser.parse_args() - - input_dirname = args.input_dir - output_dirname = args.output_dir - - input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True) - - images_count = collections.defaultdict(lambda : 0) - - os.makedirs(output_dirname) - for input_filename in tqdm(input_metadata_filenames): - # Ignore empty files - with open(input_filename, "r") as f: - original_metadata = json.load(f) - if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0: - print("No views in", input_filename) - continue - - relpath = os.path.relpath(input_filename, input_dirname) - print(relpath) - - # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability. - # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern. - scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True)) - metadata = dict() - for key, value in original_metadata.items(): - if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": - known_path = False - for dataset, dataset_path in scenes_dataset_paths.items(): - if value.startswith(dataset_path): - value = os.path.join(dataset, os.path.relpath(value, dataset_path)) - known_path = True - break - if not known_path: - raise KeyError("Unknown path:" + value) - metadata[key] = value - - # Compile some general statistics while packing data - scene_split = metadata["scene"].split("/") - upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0] - images_count[upper_level] += len(metadata["multiviews"]) - - output_filename = os.path.join(output_dirname, relpath) - os.makedirs(os.path.dirname(output_filename), exist_ok=True) - with open(output_filename, "w") as f: - json.dump(metadata, f) - - # Print statistics - print("Images count:") - for upper_level, count in images_count.items(): - print(f"- {upper_level}: {count}") \ No newline at end of file diff --git a/modules/croco/datasets/habitat_sim/paths.py b/modules/croco/datasets/habitat_sim/paths.py deleted file mode 100644 index 4d63b5fa29c274ddfeae084734a35ba66d7edee8..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/habitat_sim/paths.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -""" -Paths to Habitat-Sim scenes -""" - -import os -import json -import collections -from tqdm import tqdm - - -# Hardcoded path to the different scene datasets -SCENES_DATASET = { - "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/", - "gibson": "./data/habitat-sim-data/scene_datasets/gibson/", - "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/", - "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/", - "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/", - "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/", - "scannet": "./data/habitat-sim/scene_datasets/scannet/" -} - -SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"]) - -def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]): - scene_dataset_config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json") - scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"] - navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] - scenes_data = [] - for idx in range(len(scenes)): - output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx]) - # Add scene - data = SceneData(scene_dataset_config_file=scene_dataset_config_file, - scene = scenes[idx] + ".scene_instance.json", - navmesh = os.path.join(base_path, navmeshes[idx]), - output_dir = output_dir) - scenes_data.append(data) - return scenes_data - -def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]): - scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json") - scenes = sum([[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], []) - navmeshes = ""#[f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] - scenes_data = [] - for idx in range(len(scenes)): - output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scenes[idx]) - data = SceneData(scene_dataset_config_file=scene_dataset_config_file, - scene = scenes[idx], - navmesh = "", - output_dir = output_dir) - scenes_data.append(data) - return scenes_data - -def list_replica_scenes(base_output_dir, base_path): - scenes_data = [] - for scene_id in os.listdir(base_path): - scene = os.path.join(base_path, scene_id, "mesh.ply") - navmesh = os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh") # Not sure if I should use it - scene_dataset_config_file = "" - output_dir = os.path.join(base_output_dir, scene_id) - # Add scene only if it does not exist already, or if exist_ok - data = SceneData(scene_dataset_config_file = scene_dataset_config_file, - scene = scene, - navmesh = navmesh, - output_dir = output_dir) - scenes_data.append(data) - return scenes_data - - -def list_scenes(base_output_dir, base_path): - """ - Generic method iterating through a base_path folder to find scenes. - """ - scenes_data = [] - for root, dirs, files in os.walk(base_path, followlinks=True): - folder_scenes_data = [] - for file in files: - name, ext = os.path.splitext(file) - if ext == ".glb": - scene = os.path.join(root, name + ".glb") - navmesh = os.path.join(root, name + ".navmesh") - if not os.path.exists(navmesh): - navmesh = "" - relpath = os.path.relpath(root, base_path) - output_dir = os.path.abspath(os.path.join(base_output_dir, relpath, name)) - data = SceneData(scene_dataset_config_file="", - scene = scene, - navmesh = navmesh, - output_dir = output_dir) - folder_scenes_data.append(data) - - # Specific check for HM3D: - # When two meshesxxxx.basis.glb and xxxx.glb are present, use the 'basis' version. - basis_scenes = [data.scene[:-len(".basis.glb")] for data in folder_scenes_data if data.scene.endswith(".basis.glb")] - if len(basis_scenes) != 0: - folder_scenes_data = [data for data in folder_scenes_data if not (data.scene[:-len(".glb")] in basis_scenes)] - - scenes_data.extend(folder_scenes_data) - return scenes_data - -def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET): - scenes_data = [] - - # HM3D - for split in ("minival", "train", "val", "examples"): - scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"), - base_path=f"{scenes_dataset_paths['hm3d']}/{split}") - - # Gibson - scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"), - base_path=scenes_dataset_paths["gibson"]) - - # Habitat test scenes (just a few) - scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"), - base_path=scenes_dataset_paths["habitat-test-scenes"]) - - # ReplicaCAD (baked lightning) - scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir) - - # ScanNet - scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"), - base_path=scenes_dataset_paths["scannet"]) - - # Replica - list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"), - base_path=scenes_dataset_paths["replica"]) - return scenes_data diff --git a/modules/croco/datasets/pairs_dataset.py b/modules/croco/datasets/pairs_dataset.py deleted file mode 100644 index 9f107526b34e154d9013a9a7a0bde3d5ff6f581c..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/pairs_dataset.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import os -from torch.utils.data import Dataset -from PIL import Image - -from datasets.transforms import get_pair_transforms - -def load_image(impath): - return Image.open(impath) - -def load_pairs_from_cache_file(fname, root=''): - assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) - with open(fname, 'r') as fid: - lines = fid.read().strip().splitlines() - pairs = [ (os.path.join(root,l.split()[0]), os.path.join(root,l.split()[1])) for l in lines] - return pairs - -def load_pairs_from_list_file(fname, root=''): - assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) - with open(fname, 'r') as fid: - lines = fid.read().strip().splitlines() - pairs = [ (os.path.join(root,l+'_1.jpg'), os.path.join(root,l+'_2.jpg')) for l in lines if not l.startswith('#')] - return pairs - - -def write_cache_file(fname, pairs, root=''): - if len(root)>0: - if not root.endswith('/'): root+='/' - assert os.path.isdir(root) - s = '' - for im1, im2 in pairs: - if len(root)>0: - assert im1.startswith(root), im1 - assert im2.startswith(root), im2 - s += '{:s} {:s}\n'.format(im1[len(root):], im2[len(root):]) - with open(fname, 'w') as fid: - fid.write(s[:-1]) - -def parse_and_cache_all_pairs(dname, data_dir='./data/'): - if dname=='habitat_release': - dirname = os.path.join(data_dir, 'habitat_release') - assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname - cache_file = os.path.join(dirname, 'pairs.txt') - assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file - - print('Parsing pairs for dataset: '+dname) - pairs = [] - for root, dirs, files in os.walk(dirname): - if 'val' in root: continue - dirs.sort() - pairs += [ (os.path.join(root,f), os.path.join(root,f[:-len('_1.jpeg')]+'_2.jpeg')) for f in sorted(files) if f.endswith('_1.jpeg')] - print('Found {:,} pairs'.format(len(pairs))) - print('Writing cache to: '+cache_file) - write_cache_file(cache_file, pairs, root=dirname) - - else: - raise NotImplementedError('Unknown dataset: '+dname) - -def dnames_to_image_pairs(dnames, data_dir='./data/'): - """ - dnames: list of datasets with image pairs, separated by + - """ - all_pairs = [] - for dname in dnames.split('+'): - if dname=='habitat_release': - dirname = os.path.join(data_dir, 'habitat_release') - assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname - cache_file = os.path.join(dirname, 'pairs.txt') - assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file - pairs = load_pairs_from_cache_file(cache_file, root=dirname) - elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']: - dirname = os.path.join(data_dir, dname+'_crops') - assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname) - list_file = os.path.join(dirname, 'listing.txt') - assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. {:s}".format(dname, list_file) - pairs = load_pairs_from_list_file(list_file, root=dirname) - print(' {:s}: {:,} pairs'.format(dname, len(pairs))) - all_pairs += pairs - if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs))) - return all_pairs - - -class PairsDataset(Dataset): - - def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'): - super().__init__() - self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir) - self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize) - - def __len__(self): - return len(self.image_pairs) - - def __getitem__(self, index): - im1path, im2path = self.image_pairs[index] - im1 = load_image(im1path) - im2 = load_image(im2path) - if self.transforms is not None: im1, im2 = self.transforms(im1, im2) - return im1, im2 - - -if __name__=="__main__": - import argparse - parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset") - parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") - parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset") - args = parser.parse_args() - parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir) diff --git a/modules/croco/datasets/transforms.py b/modules/croco/datasets/transforms.py deleted file mode 100644 index 216bac61f8254fd50e7f269ee80301f250a2d11e..0000000000000000000000000000000000000000 --- a/modules/croco/datasets/transforms.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import torch -import torchvision.transforms -import torchvision.transforms.functional as F - -# "Pair": apply a transform on a pair -# "Both": apply the exact same transform to both images - -class ComposePair(torchvision.transforms.Compose): - def __call__(self, img1, img2): - for t in self.transforms: - img1, img2 = t(img1, img2) - return img1, img2 - -class NormalizeBoth(torchvision.transforms.Normalize): - def forward(self, img1, img2): - img1 = super().forward(img1) - img2 = super().forward(img2) - return img1, img2 - -class ToTensorBoth(torchvision.transforms.ToTensor): - def __call__(self, img1, img2): - img1 = super().__call__(img1) - img2 = super().__call__(img2) - return img1, img2 - -class RandomCropPair(torchvision.transforms.RandomCrop): - # the crop will be intentionally different for the two images with this class - def forward(self, img1, img2): - img1 = super().forward(img1) - img2 = super().forward(img2) - return img1, img2 - -class ColorJitterPair(torchvision.transforms.ColorJitter): - # can be symmetric (same for both images) or assymetric (different jitter params for each image) depending on assymetric_prob - def __init__(self, assymetric_prob, **kwargs): - super().__init__(**kwargs) - self.assymetric_prob = assymetric_prob - def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor): - for fn_id in fn_idx: - if fn_id == 0 and brightness_factor is not None: - img = F.adjust_brightness(img, brightness_factor) - elif fn_id == 1 and contrast_factor is not None: - img = F.adjust_contrast(img, contrast_factor) - elif fn_id == 2 and saturation_factor is not None: - img = F.adjust_saturation(img, saturation_factor) - elif fn_id == 3 and hue_factor is not None: - img = F.adjust_hue(img, hue_factor) - return img - - def forward(self, img1, img2): - - fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( - self.brightness, self.contrast, self.saturation, self.hue - ) - img1 = self.jitter_one(img1, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor) - if torch.rand(1) < self.assymetric_prob: # assymetric: - fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( - self.brightness, self.contrast, self.saturation, self.hue - ) - img2 = self.jitter_one(img2, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor) - return img1, img2 - -def get_pair_transforms(transform_str, totensor=True, normalize=True): - # transform_str is eg crop224+color - trfs = [] - for s in transform_str.split('+'): - if s.startswith('crop'): - size = int(s[len('crop'):]) - trfs.append(RandomCropPair(size)) - elif s=='acolor': - trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0)) - elif s=='': # if transform_str was "" - pass - else: - raise NotImplementedError('Unknown augmentation: '+s) - - if totensor: - trfs.append( ToTensorBoth() ) - if normalize: - trfs.append( NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ) - - if len(trfs)==0: - return None - elif len(trfs)==1: - return trfs - else: - return ComposePair(trfs) - - - - - diff --git a/modules/croco/demo.py b/modules/croco/demo.py deleted file mode 100644 index 91b80ccc5c98c18e20d1ce782511aa824ef28f77..0000000000000000000000000000000000000000 --- a/modules/croco/demo.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import torch -from models.croco import CroCoNet -from PIL import Image -import torchvision.transforms -from torchvision.transforms import ToTensor, Normalize, Compose - -def main(): - device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count()>0 else 'cpu') - - # load 224x224 images and transform them to tensor - imagenet_mean = [0.485, 0.456, 0.406] - imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1,3,1,1).to(device, non_blocking=True) - imagenet_std = [0.229, 0.224, 0.225] - imagenet_std_tensor = torch.tensor(imagenet_std).view(1,3,1,1).to(device, non_blocking=True) - trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)]) - image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0) - image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0) - - # load model - ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu') - model = CroCoNet( **ckpt.get('croco_kwargs',{})).to(device) - model.eval() - msg = model.load_state_dict(ckpt['model'], strict=True) - - # forward - with torch.inference_mode(): - out, mask, target = model(image1, image2) - - # the output is normalized, thus use the mean/std of the actual image to go back to RGB space - patchified = model.patchify(image1) - mean = patchified.mean(dim=-1, keepdim=True) - var = patchified.var(dim=-1, keepdim=True) - decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean) - # undo imagenet normalization, prepare masked image - decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor - input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor - ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor - image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None]) - masked_input_image = ((1 - image_masks) * input_image) - - # make visualization - visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3) # 4*(B, 3, H, W) -> B, 3, H, W*4 - B, C, H, W = visualization.shape - visualization = visualization.permute(1, 0, 2, 3).reshape(C, B*H, W) - visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1)) - fname = "demo_output.png" - visualization.save(fname) - print('Visualization save in '+fname) - - -if __name__=="__main__": - main() diff --git a/modules/croco/interactive_demo.ipynb b/modules/croco/interactive_demo.ipynb deleted file mode 100644 index 6cfc960af5baac9a69029c29a16eea4e24123a71..0000000000000000000000000000000000000000 --- a/modules/croco/interactive_demo.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Interactive demo of Cross-view Completion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n", - "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import numpy as np\n", - "from models.croco import CroCoNet\n", - "from ipywidgets import interact, interactive, fixed, interact_manual\n", - "import ipywidgets as widgets\n", - "import matplotlib.pyplot as plt\n", - "import quaternion\n", - "import models.masking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load CroCo model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n", - "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n", - "msg = model.load_state_dict(ckpt['model'], strict=True)\n", - "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", - "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", - "model = model.eval()\n", - "model = model.to(device=device)\n", - "print(msg)\n", - "\n", - "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n", - " \"\"\"\n", - " Perform Cross-View completion using two input images, specified using Numpy arrays.\n", - " \"\"\"\n", - " # Replace the mask generator\n", - " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n", - "\n", - " # ImageNet-1k color normalization\n", - " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n", - " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n", - "\n", - " normalize_input_colors = True\n", - " is_output_normalized = True\n", - " with torch.no_grad():\n", - " # Cast data to torch\n", - " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", - " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", - "\n", - " if normalize_input_colors:\n", - " ref_image = (ref_image - imagenet_mean) / imagenet_std\n", - " target_image = (target_image - imagenet_mean) / imagenet_std\n", - "\n", - " out, mask, _ = model(target_image, ref_image)\n", - " # # get target\n", - " if not is_output_normalized:\n", - " predicted_image = model.unpatchify(out)\n", - " else:\n", - " # The output only contains higher order information,\n", - " # we retrieve mean and standard deviation from the actual target image\n", - " patchified = model.patchify(target_image)\n", - " mean = patchified.mean(dim=-1, keepdim=True)\n", - " var = patchified.var(dim=-1, keepdim=True)\n", - " pred_renorm = out * (var + 1.e-6)**.5 + mean\n", - " predicted_image = model.unpatchify(pred_renorm)\n", - "\n", - " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n", - " masked_target_image = (1 - image_masks) * target_image\n", - " \n", - " if not reconstruct_unmasked_patches:\n", - " # Replace unmasked patches by their actual values\n", - " predicted_image = predicted_image * image_masks + masked_target_image\n", - "\n", - " # Unapply color normalization\n", - " if normalize_input_colors:\n", - " predicted_image = predicted_image * imagenet_std + imagenet_mean\n", - " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n", - " \n", - " # Cast to Numpy\n", - " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", - " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", - " return masked_target_image, predicted_image" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n", - "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n", - "import habitat_sim\n", - "\n", - "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n", - "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n", - "\n", - "sim_cfg = habitat_sim.SimulatorConfiguration()\n", - "if use_gpu: sim_cfg.gpu_device_id = 0\n", - "sim_cfg.scene_id = scene\n", - "sim_cfg.load_semantic_mesh = False\n", - "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n", - "rgb_sensor_spec.uuid = \"color\"\n", - "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n", - "rgb_sensor_spec.resolution = (224,224)\n", - "rgb_sensor_spec.hfov = 56.56\n", - "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n", - "rgb_sensor_spec.orientation = [0, 0, 0]\n", - "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n", - "\n", - "\n", - "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n", - "sim = habitat_sim.Simulator(cfg)\n", - "if navmesh is not None:\n", - " sim.pathfinder.load_nav_mesh(navmesh)\n", - "agent = sim.initialize_agent(agent_id=0)\n", - "\n", - "def sample_random_viewpoint():\n", - " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n", - " nav_point = sim.pathfinder.get_random_navigable_point()\n", - " # Sample a random viewpoint height\n", - " viewpoint_height = np.random.uniform(1.0, 1.6)\n", - " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n", - " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n", - " return viewpoint_position, viewpoint_orientation\n", - "\n", - "def render_viewpoint(position, orientation):\n", - " agent_state = habitat_sim.AgentState()\n", - " agent_state.position = position\n", - " agent_state.rotation = orientation\n", - " agent.set_state(agent_state)\n", - " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n", - " image = viewpoint_observations['color'][:,:,:3]\n", - " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n", - " return image" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample a random reference view" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ref_position, ref_orientation = sample_random_viewpoint()\n", - "ref_image = render_viewpoint(ref_position, ref_orientation)\n", - "plt.clf()\n", - "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n", - "axes[0,0].imshow(ref_image)\n", - "for ax in axes.flatten():\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Interactive cross-view completion using CroCo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reconstruct_unmasked_patches = False\n", - "\n", - "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n", - " R = quaternion.as_rotation_matrix(ref_orientation)\n", - " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n", - " target_orientation = (ref_orientation\n", - " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n", - " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n", - " \n", - " ref_image = render_viewpoint(ref_position, ref_orientation)\n", - " target_image = render_viewpoint(target_position, target_orientation)\n", - "\n", - " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n", - "\n", - " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n", - " axes[0].imshow(ref_image)\n", - " axes[0].set_xlabel(\"Reference\")\n", - " axes[1].imshow(masked_target_image)\n", - " axes[1].set_xlabel(\"Masked target\")\n", - " axes[2].imshow(predicted_image)\n", - " axes[2].set_xlabel(\"Reconstruction\") \n", - " axes[3].imshow(target_image)\n", - " axes[3].set_xlabel(\"Target\")\n", - " for ax in axes.flatten():\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "\n", - "interact(show_demo,\n", - " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n", - " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", - " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", - " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", - " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n", - " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - }, - "vscode": { - "interpreter": { - "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/modules/croco/models/__pycache__/blocks.cpython-312.pyc b/modules/croco/models/__pycache__/blocks.cpython-312.pyc deleted file mode 100644 index 93c7671dc04288faa20b2087e4bb321dcf7d5eae..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/__pycache__/blocks.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/__pycache__/croco.cpython-312.pyc b/modules/croco/models/__pycache__/croco.cpython-312.pyc deleted file mode 100644 index 65996eee0bb753f42a25bc58319087c8d022631f..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/__pycache__/croco.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/__pycache__/dpt_block.cpython-312.pyc b/modules/croco/models/__pycache__/dpt_block.cpython-312.pyc deleted file mode 100644 index a9af6c9bbec80addbe45243829a2f2c1b3d6883d..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/__pycache__/dpt_block.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/__pycache__/masking.cpython-312.pyc b/modules/croco/models/__pycache__/masking.cpython-312.pyc deleted file mode 100644 index fe29441f7cac1ef9d261fa4dce6fefb2a2e4a197..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/__pycache__/masking.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/__pycache__/pos_embed.cpython-312.pyc b/modules/croco/models/__pycache__/pos_embed.cpython-312.pyc deleted file mode 100644 index 6f758f374520be10fbdcd9df4d94166fe464ed20..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/__pycache__/pos_embed.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/blocks.py b/modules/croco/models/blocks.py deleted file mode 100644 index 18133524f0ae265b0bd8d062d7c9eeaa63858a9b..0000000000000000000000000000000000000000 --- a/modules/croco/models/blocks.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - - -# -------------------------------------------------------- -# Main encoder/decoder blocks -# -------------------------------------------------------- -# References: -# timm -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py -# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py - - -import torch -import torch.nn as nn - -from itertools import repeat -import collections.abc - - -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): - return x - return tuple(repeat(x, n)) - return parse -to_2tuple = _ntuple(2) - -def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - if drop_prob == 0. or not training: - return x - keep_prob = 1 - drop_prob - shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = x.new_empty(shape).bernoulli_(keep_prob) - if keep_prob > 0.0 and scale_by_keep: - random_tensor.div_(keep_prob) - return x * random_tensor - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - self.scale_by_keep = scale_by_keep - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) - - def extra_repr(self): - return f'drop_prob={round(self.drop_prob,3):0.3f}' - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - - self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.fc2(x) - x = self.drop2(x) - return x - -class Attention(nn.Module): - - def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = head_dim ** -0.5 - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.rope = rope - - def forward(self, x, xpos): - B, N, C = x.shape - - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3) - q, k, v = [qkv[:,:,i] for i in range(3)] - # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple) - - if self.rope is not None: - q = self.rope(q, xpos) - k = self.rope(k, xpos) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - -class Block(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward(self, x, xpos): - x = x + self.drop_path(self.attn(self.norm1(x), xpos)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - -class CrossAttention(nn.Module): - - def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = head_dim ** -0.5 - - self.projq = nn.Linear(dim, dim, bias=qkv_bias) - self.projk = nn.Linear(dim, dim, bias=qkv_bias) - self.projv = nn.Linear(dim, dim, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - self.rope = rope - - def forward(self, query, key, value, qpos, kpos): - B, Nq, C = query.shape - Nk = key.shape[1] - Nv = value.shape[1] - - q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) - k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) - v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) - - if self.rope is not None: - q = self.rope(q, qpos) - k = self.rope(k, kpos) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, Nq, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - -class DecoderBlock(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) - self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - self.norm3 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - self.norm_y = norm_layer(dim) if norm_mem else nn.Identity() - - def forward(self, x, y, xpos, ypos): - x = x + self.drop_path(self.attn(self.norm1(x), xpos)) - y_ = self.norm_y(y) - x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos)) - x = x + self.drop_path(self.mlp(self.norm3(x))) - return x, y - - -# patch embedding -class PositionGetter(object): - """ return positions of patches """ - - def __init__(self): - self.cache_positions = {} - - def __call__(self, b, h, w, device): - if not (h,w) in self.cache_positions: - x = torch.arange(w, device=device) - y = torch.arange(h, device=device) - self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h, w, 2) - pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone() - return pos - -class PatchEmbed(nn.Module): - """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed""" - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.num_patches = self.grid_size[0] * self.grid_size[1] - self.flatten = flatten - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - self.position_getter = PositionGetter() - - def forward(self, x): - B, C, H, W = x.shape - torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") - torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") - x = self.proj(x) - pos = self.position_getter(B, x.size(2), x.size(3), x.device) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - return x, pos - - def _init_weights(self): - w = self.proj.weight.data - torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) - diff --git a/modules/croco/models/criterion.py b/modules/croco/models/criterion.py deleted file mode 100644 index 11696c40865344490f23796ea45e8fbd5e654731..0000000000000000000000000000000000000000 --- a/modules/croco/models/criterion.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Criterion to train CroCo -# -------------------------------------------------------- -# References: -# MAE: https://github.com/facebookresearch/mae -# -------------------------------------------------------- - -import torch - -class MaskedMSE(torch.nn.Module): - - def __init__(self, norm_pix_loss=False, masked=True): - """ - norm_pix_loss: normalize each patch by their pixel mean and variance - masked: compute loss over the masked patches only - """ - super().__init__() - self.norm_pix_loss = norm_pix_loss - self.masked = masked - - def forward(self, pred, mask, target): - - if self.norm_pix_loss: - mean = target.mean(dim=-1, keepdim=True) - var = target.var(dim=-1, keepdim=True) - target = (target - mean) / (var + 1.e-6)**.5 - - loss = (pred - target) ** 2 - loss = loss.mean(dim=-1) # [N, L], mean loss per patch - if self.masked: - loss = (loss * mask).sum() / mask.sum() # mean loss on masked patches - else: - loss = loss.mean() # mean loss - return loss diff --git a/modules/croco/models/croco.py b/modules/croco/models/croco.py deleted file mode 100644 index 14c68634152d75555b4c35c25af268394c5821fe..0000000000000000000000000000000000000000 --- a/modules/croco/models/croco.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - - -# -------------------------------------------------------- -# CroCo model during pretraining -# -------------------------------------------------------- - - - -import torch -import torch.nn as nn -torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 -from functools import partial - -from models.blocks import Block, DecoderBlock, PatchEmbed -from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D -from models.masking import RandomMask - - -class CroCoNet(nn.Module): - - def __init__(self, - img_size=224, # input image size - patch_size=16, # patch_size - mask_ratio=0.9, # ratios of masked tokens - enc_embed_dim=768, # encoder feature dimension - enc_depth=12, # encoder depth - enc_num_heads=12, # encoder number of heads in the transformer block - dec_embed_dim=512, # decoder feature dimension - dec_depth=8, # decoder depth - dec_num_heads=16, # decoder number of heads in the transformer block - mlp_ratio=4, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder - pos_embed='cosine', # positional embedding (either cosine or RoPE100) - ): - - super(CroCoNet, self).__init__() - - # patch embeddings (with initialization done as in MAE) - self._set_patch_embed(img_size, patch_size, enc_embed_dim) - - # mask generations - self._set_mask_generator(self.patch_embed.num_patches, mask_ratio) - - self.pos_embed = pos_embed - if pos_embed=='cosine': - # positional embedding of the encoder - enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0) - self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float()) - # positional embedding of the decoder - dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0) - self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float()) - # pos embedding in each block - self.rope = None # nothing for cosine - elif pos_embed.startswith('RoPE'): # eg RoPE100 - self.enc_pos_embed = None # nothing to add in the encoder with RoPE - self.dec_pos_embed = None # nothing to add in the decoder with RoPE - if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions") - freq = float(pos_embed[len('RoPE'):]) - self.rope = RoPE2D(freq=freq) - else: - raise NotImplementedError('Unknown pos_embed '+pos_embed) - - # transformer for the encoder - self.enc_depth = enc_depth - self.enc_embed_dim = enc_embed_dim - self.enc_blocks = nn.ModuleList([ - Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope) - for i in range(enc_depth)]) - self.enc_norm = norm_layer(enc_embed_dim) - - # masked tokens - self._set_mask_token(dec_embed_dim) - - # decoder - self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec) - - # prediction head - self._set_prediction_head(dec_embed_dim, patch_size) - - # initializer weights - self.initialize_weights() - - def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): - self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim) - - def _set_mask_generator(self, num_patches, mask_ratio): - self.mask_generator = RandomMask(num_patches, mask_ratio) - - def _set_mask_token(self, dec_embed_dim): - self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim)) - - def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec): - self.dec_depth = dec_depth - self.dec_embed_dim = dec_embed_dim - # transfer from encoder to decoder - self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) - # transformer for the decoder - self.dec_blocks = nn.ModuleList([ - DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope) - for i in range(dec_depth)]) - # final norm layer - self.dec_norm = norm_layer(dec_embed_dim) - - def _set_prediction_head(self, dec_embed_dim, patch_size): - self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True) - - - def initialize_weights(self): - # patch embed - self.patch_embed._init_weights() - # mask tokens - if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02) - # linears and layer norms - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - # we use xavier_uniform following official JAX ViT: - torch.nn.init.xavier_uniform_(m.weight) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def _encode_image(self, image, do_mask=False, return_all_blocks=False): - """ - image has B x 3 x img_size x img_size - do_mask: whether to perform masking or not - return_all_blocks: if True, return the features at the end of every block - instead of just the features from the last block (eg for some prediction heads) - """ - # embed the image into patches (x has size B x Npatches x C) - # and get position if each return patch (pos has size B x Npatches x 2) - x, pos = self.patch_embed(image) - # add positional embedding without cls token - if self.enc_pos_embed is not None: - x = x + self.enc_pos_embed[None,...] - # apply masking - B,N,C = x.size() - if do_mask: - masks = self.mask_generator(x) - x = x[~masks].view(B, -1, C) - posvis = pos[~masks].view(B, -1, 2) - else: - B,N,C = x.size() - masks = torch.zeros((B,N), dtype=bool) - posvis = pos - # now apply the transformer encoder and normalization - if return_all_blocks: - out = [] - for blk in self.enc_blocks: - x = blk(x, posvis) - out.append(x) - out[-1] = self.enc_norm(out[-1]) - return out, pos, masks - else: - for blk in self.enc_blocks: - x = blk(x, posvis) - x = self.enc_norm(x) - return x, pos, masks - - def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False): - """ - return_all_blocks: if True, return the features at the end of every block - instead of just the features from the last block (eg for some prediction heads) - - masks1 can be None => assume image1 fully visible - """ - # encoder to decoder layer - visf1 = self.decoder_embed(feat1) - f2 = self.decoder_embed(feat2) - # append masked tokens to the sequence - B,Nenc,C = visf1.size() - if masks1 is None: # downstreams - f1_ = visf1 - else: # pretraining - Ntotal = masks1.size(1) - f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype) - f1_[~masks1] = visf1.view(B * Nenc, C) - # add positional embedding - if self.dec_pos_embed is not None: - f1_ = f1_ + self.dec_pos_embed - f2 = f2 + self.dec_pos_embed - # apply Transformer blocks - out = f1_ - out2 = f2 - if return_all_blocks: - _out, out = out, [] - for blk in self.dec_blocks: - _out, out2 = blk(_out, out2, pos1, pos2) - out.append(_out) - out[-1] = self.dec_norm(out[-1]) - else: - for blk in self.dec_blocks: - out, out2 = blk(out, out2, pos1, pos2) - out = self.dec_norm(out) - return out - - def patchify(self, imgs): - """ - imgs: (B, 3, H, W) - x: (B, L, patch_size**2 *3) - """ - p = self.patch_embed.patch_size[0] - assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 - - h = w = imgs.shape[2] // p - x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) - x = torch.einsum('nchpwq->nhwpqc', x) - x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) - - return x - - def unpatchify(self, x, channels=3): - """ - x: (N, L, patch_size**2 *channels) - imgs: (N, 3, H, W) - """ - patch_size = self.patch_embed.patch_size[0] - h = w = int(x.shape[1]**.5) - assert h * w == x.shape[1] - x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels)) - x = torch.einsum('nhwpqc->nchpwq', x) - imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size)) - return imgs - - def forward(self, img1, img2): - """ - img1: tensor of size B x 3 x img_size x img_size - img2: tensor of size B x 3 x img_size x img_size - - out will be B x N x (3*patch_size*patch_size) - masks are also returned as B x N just in case - """ - # encoder of the masked first image - feat1, pos1, mask1 = self._encode_image(img1, do_mask=True) - # encoder of the second image - feat2, pos2, _ = self._encode_image(img2, do_mask=False) - # decoder - decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2) - # prediction head - out = self.prediction_head(decfeat) - # get target - target = self.patchify(img1) - return out, mask1, target diff --git a/modules/croco/models/croco_downstream.py b/modules/croco/models/croco_downstream.py deleted file mode 100644 index 159dfff4d2c1461bc235e21441b57ce1e2088f76..0000000000000000000000000000000000000000 --- a/modules/croco/models/croco_downstream.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# CroCo model for downstream tasks -# -------------------------------------------------------- - -import torch - -from .croco import CroCoNet - - -def croco_args_from_ckpt(ckpt): - if 'croco_kwargs' in ckpt: # CroCo v2 released models - return ckpt['croco_kwargs'] - elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release - s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)" - assert s.startswith('CroCoNet(') - return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it - else: # CroCo v1 released models - return dict() - -class CroCoDownstreamMonocularEncoder(CroCoNet): - - def __init__(self, - head, - **kwargs): - """ Build network for monocular downstream task, only using the encoder. - It takes an extra argument head, that is called with the features - and a dictionary img_info containing 'width' and 'height' keys - The head is setup with the croconet arguments in this init function - NOTE: It works by *calling super().__init__() but with redefined setters - - """ - super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs) - head.setup(self) - self.head = head - - def _set_mask_generator(self, *args, **kwargs): - """ No mask generator """ - return - - def _set_mask_token(self, *args, **kwargs): - """ No mask token """ - self.mask_token = None - return - - def _set_decoder(self, *args, **kwargs): - """ No decoder """ - return - - def _set_prediction_head(self, *args, **kwargs): - """ No 'prediction head' for downstream tasks.""" - return - - def forward(self, img): - """ - img if of size batch_size x 3 x h x w - """ - B, C, H, W = img.size() - img_info = {'height': H, 'width': W} - need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks - out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers) - return self.head(out, img_info) - - -class CroCoDownstreamBinocular(CroCoNet): - - def __init__(self, - head, - **kwargs): - """ Build network for binocular downstream task - It takes an extra argument head, that is called with the features - and a dictionary img_info containing 'width' and 'height' keys - The head is setup with the croconet arguments in this init function - """ - super(CroCoDownstreamBinocular, self).__init__(**kwargs) - head.setup(self) - self.head = head - - def _set_mask_generator(self, *args, **kwargs): - """ No mask generator """ - return - - def _set_mask_token(self, *args, **kwargs): - """ No mask token """ - self.mask_token = None - return - - def _set_prediction_head(self, *args, **kwargs): - """ No prediction head for downstream tasks, define your own head """ - return - - def encode_image_pairs(self, img1, img2, return_all_blocks=False): - """ run encoder for a pair of images - it is actually ~5% faster to concatenate the images along the batch dimension - than to encode them separately - """ - ## the two commented lines below is the naive version with separate encoding - #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks) - #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False) - ## and now the faster version - out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks ) - if return_all_blocks: - out,out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out]))) - out2 = out2[-1] - else: - out,out2 = out.chunk(2, dim=0) - pos,pos2 = pos.chunk(2, dim=0) - return out, out2, pos, pos2 - - def forward(self, img1, img2): - B, C, H, W = img1.size() - img_info = {'height': H, 'width': W} - return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks - out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks) - if return_all_blocks: - decout = self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks) - decout = out+decout - else: - decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks) - return self.head(decout, img_info) \ No newline at end of file diff --git a/modules/croco/models/curope/__init__.py b/modules/croco/models/curope/__init__.py deleted file mode 100644 index 25e3d48a162760260826080f6366838e83e26878..0000000000000000000000000000000000000000 --- a/modules/croco/models/curope/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -from .curope2d import cuRoPE2D diff --git a/modules/croco/models/curope/__pycache__/__init__.cpython-312.pyc b/modules/croco/models/curope/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 1284f3c29d23a4417745e6f6ec4646a470215396..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/curope/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/curope/__pycache__/curope2d.cpython-312.pyc b/modules/croco/models/curope/__pycache__/curope2d.cpython-312.pyc deleted file mode 100644 index cfee90c8c4221943b91369c1bd5ab8c133b08fe3..0000000000000000000000000000000000000000 Binary files a/modules/croco/models/curope/__pycache__/curope2d.cpython-312.pyc and /dev/null differ diff --git a/modules/croco/models/curope/curope.cpp b/modules/croco/models/curope/curope.cpp deleted file mode 100644 index 8fe9058e05aa1bf3f37b0d970edc7312bc68455b..0000000000000000000000000000000000000000 --- a/modules/croco/models/curope/curope.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - Copyright (C) 2022-present Naver Corporation. All rights reserved. - Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -*/ - -#include - -// forward declaration -void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ); - -void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd ) -{ - const int B = tokens.size(0); - const int N = tokens.size(1); - const int H = tokens.size(2); - const int D = tokens.size(3) / 4; - - auto tok = tokens.accessor(); - auto pos = positions.accessor(); - - for (int b = 0; b < B; b++) { - for (int x = 0; x < 2; x++) { // y and then x (2d) - for (int n = 0; n < N; n++) { - - // grab the token position - const int p = pos[b][n][x]; - - for (int h = 0; h < H; h++) { - for (int d = 0; d < D; d++) { - // grab the two values - float u = tok[b][n][h][d+0+x*2*D]; - float v = tok[b][n][h][d+D+x*2*D]; - - // grab the cos,sin - const float inv_freq = fwd * p / powf(base, d/float(D)); - float c = cosf(inv_freq); - float s = sinf(inv_freq); - - // write the result - tok[b][n][h][d+0+x*2*D] = u*c - v*s; - tok[b][n][h][d+D+x*2*D] = v*c + u*s; - } - } - } - } - } -} - -void rope_2d( torch::Tensor tokens, // B,N,H,D - const torch::Tensor positions, // B,N,2 - const float base, - const float fwd ) -{ - TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions"); - TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions"); - TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions"); - TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions"); - TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2"); - TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" ); - - if (tokens.is_cuda()) - rope_2d_cuda( tokens, positions, base, fwd ); - else - rope_2d_cpu( tokens, positions, base, fwd ); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward"); -} diff --git a/modules/croco/models/curope/curope2d.py b/modules/croco/models/curope/curope2d.py deleted file mode 100644 index a49c12f8c529e9a889b5ac20c5767158f238e17d..0000000000000000000000000000000000000000 --- a/modules/croco/models/curope/curope2d.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -import torch - -try: - import curope as _kernels # run `python setup.py install` -except ModuleNotFoundError: - from . import curope as _kernels # run `python setup.py build_ext --inplace` - - -class cuRoPE2D_func (torch.autograd.Function): - - @staticmethod - def forward(ctx, tokens, positions, base, F0=1): - ctx.save_for_backward(positions) - ctx.saved_base = base - ctx.saved_F0 = F0 - # tokens = tokens.clone() # uncomment this if inplace doesn't work - _kernels.rope_2d( tokens, positions, base, F0 ) - ctx.mark_dirty(tokens) - return tokens - - @staticmethod - def backward(ctx, grad_res): - positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0 - _kernels.rope_2d( grad_res, positions, base, -F0 ) - ctx.mark_dirty(grad_res) - return grad_res, None, None, None - - -class cuRoPE2D(torch.nn.Module): - def __init__(self, freq=100.0, F0=1.0): - super().__init__() - self.base = freq - self.F0 = F0 - - def forward(self, tokens, positions): - cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 ) - return tokens \ No newline at end of file diff --git a/modules/croco/models/curope/kernels.cu b/modules/croco/models/curope/kernels.cu deleted file mode 100644 index 7156cd1bb935cb1f0be45e58add53f9c21505c20..0000000000000000000000000000000000000000 --- a/modules/croco/models/curope/kernels.cu +++ /dev/null @@ -1,108 +0,0 @@ -/* - Copyright (C) 2022-present Naver Corporation. All rights reserved. - Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -*/ - -#include -#include -#include -#include - -#define CHECK_CUDA(tensor) {\ - TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \ - TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); } -void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));} - - -template < typename scalar_t > -__global__ void rope_2d_cuda_kernel( - //scalar_t* __restrict__ tokens, - torch::PackedTensorAccessor32 tokens, - const int64_t* __restrict__ pos, - const float base, - const float fwd ) - // const int N, const int H, const int D ) -{ - // tokens shape = (B, N, H, D) - const int N = tokens.size(1); - const int H = tokens.size(2); - const int D = tokens.size(3); - - // each block update a single token, for all heads - // each thread takes care of a single output - extern __shared__ float shared[]; - float* shared_inv_freq = shared + D; - - const int b = blockIdx.x / N; - const int n = blockIdx.x % N; - - const int Q = D / 4; - // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D] - // u_Y v_Y u_X v_X - - // shared memory: first, compute inv_freq - if (threadIdx.x < Q) - shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q)); - __syncthreads(); - - // start of X or Y part - const int X = threadIdx.x < D/2 ? 0 : 1; - const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X - - // grab the cos,sin appropriate for me - const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q]; - const float cos = cosf(freq); - const float sin = sinf(freq); - /* - float* shared_cos_sin = shared + D + D/4; - if ((threadIdx.x % (D/2)) < Q) - shared_cos_sin[m+0] = cosf(freq); - else - shared_cos_sin[m+Q] = sinf(freq); - __syncthreads(); - const float cos = shared_cos_sin[m+0]; - const float sin = shared_cos_sin[m+Q]; - */ - - for (int h = 0; h < H; h++) - { - // then, load all the token for this head in shared memory - shared[threadIdx.x] = tokens[b][n][h][threadIdx.x]; - __syncthreads(); - - const float u = shared[m]; - const float v = shared[m+Q]; - - // write output - if ((threadIdx.x % (D/2)) < Q) - tokens[b][n][h][threadIdx.x] = u*cos - v*sin; - else - tokens[b][n][h][threadIdx.x] = v*cos + u*sin; - } -} - -void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ) -{ - const int B = tokens.size(0); // batch size - const int N = tokens.size(1); // sequence length - const int H = tokens.size(2); // number of heads - const int D = tokens.size(3); // dimension per head - - TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous"); - TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous"); - TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape"); - TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4"); - - // one block for each layer, one thread per local-max - const int THREADS_PER_BLOCK = D; - const int N_BLOCKS = B * N; // each block takes care of H*D values - const int SHARED_MEM = sizeof(float) * (D + D/4); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] { - rope_2d_cuda_kernel <<>> ( - //tokens.data_ptr(), - tokens.packed_accessor32(), - pos.data_ptr(), - base, fwd); //, N, H, D ); - })); -} diff --git a/modules/croco/models/curope/setup.py b/modules/croco/models/curope/setup.py deleted file mode 100644 index 230632ed05e309200e8f93a3a852072333975009..0000000000000000000000000000000000000000 --- a/modules/croco/models/curope/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -from setuptools import setup -from torch import cuda -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -# compile for all possible CUDA architectures -all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split() -# alternatively, you can list cuda archs that you want, eg: -# all_cuda_archs = [ - # '-gencode', 'arch=compute_70,code=sm_70', - # '-gencode', 'arch=compute_75,code=sm_75', - # '-gencode', 'arch=compute_80,code=sm_80', - # '-gencode', 'arch=compute_86,code=sm_86' -# ] - -setup( - name = 'curope', - ext_modules = [ - CUDAExtension( - name='curope', - sources=[ - "curope.cpp", - "kernels.cu", - ], - extra_compile_args = dict( - nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs, - cxx=['-O3']) - ) - ], - cmdclass = { - 'build_ext': BuildExtension - }) diff --git a/modules/croco/models/dpt_block.py b/modules/croco/models/dpt_block.py deleted file mode 100644 index d4ddfb74e2769ceca88720d4c730e00afd71c763..0000000000000000000000000000000000000000 --- a/modules/croco/models/dpt_block.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# DPT head for ViTs -# -------------------------------------------------------- -# References: -# https://github.com/isl-org/DPT -# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py - -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange, repeat -from typing import Union, Tuple, Iterable, List, Optional, Dict - -def pair(t): - return t if isinstance(t, tuple) else (t, t) - -def make_scratch(in_shape, out_shape, groups=1, expand=False): - scratch = nn.Module() - - out_shape1 = out_shape - out_shape2 = out_shape - out_shape3 = out_shape - out_shape4 = out_shape - if expand == True: - out_shape1 = out_shape - out_shape2 = out_shape * 2 - out_shape3 = out_shape * 4 - out_shape4 = out_shape * 8 - - scratch.layer1_rn = nn.Conv2d( - in_shape[0], - out_shape1, - kernel_size=3, - stride=1, - padding=1, - bias=False, - groups=groups, - ) - scratch.layer2_rn = nn.Conv2d( - in_shape[1], - out_shape2, - kernel_size=3, - stride=1, - padding=1, - bias=False, - groups=groups, - ) - scratch.layer3_rn = nn.Conv2d( - in_shape[2], - out_shape3, - kernel_size=3, - stride=1, - padding=1, - bias=False, - groups=groups, - ) - scratch.layer4_rn = nn.Conv2d( - in_shape[3], - out_shape4, - kernel_size=3, - stride=1, - padding=1, - bias=False, - groups=groups, - ) - - scratch.layer_rn = nn.ModuleList([ - scratch.layer1_rn, - scratch.layer2_rn, - scratch.layer3_rn, - scratch.layer4_rn, - ]) - - return scratch - -class ResidualConvUnit_custom(nn.Module): - """Residual convolution module.""" - - def __init__(self, features, activation, bn): - """Init. - Args: - features (int): number of features - """ - super().__init__() - - self.bn = bn - - self.groups = 1 - - self.conv1 = nn.Conv2d( - features, - features, - kernel_size=3, - stride=1, - padding=1, - bias=not self.bn, - groups=self.groups, - ) - - self.conv2 = nn.Conv2d( - features, - features, - kernel_size=3, - stride=1, - padding=1, - bias=not self.bn, - groups=self.groups, - ) - - if self.bn == True: - self.bn1 = nn.BatchNorm2d(features) - self.bn2 = nn.BatchNorm2d(features) - - self.activation = activation - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, x): - """Forward pass. - Args: - x (tensor): input - Returns: - tensor: output - """ - - out = self.activation(x) - out = self.conv1(out) - if self.bn == True: - out = self.bn1(out) - - out = self.activation(out) - out = self.conv2(out) - if self.bn == True: - out = self.bn2(out) - - if self.groups > 1: - out = self.conv_merge(out) - - return self.skip_add.add(out, x) - -class FeatureFusionBlock_custom(nn.Module): - """Feature fusion block.""" - - def __init__( - self, - features, - activation, - deconv=False, - bn=False, - expand=False, - align_corners=True, - width_ratio=1, - ): - """Init. - Args: - features (int): number of features - """ - super(FeatureFusionBlock_custom, self).__init__() - self.width_ratio = width_ratio - - self.deconv = deconv - self.align_corners = align_corners - - self.groups = 1 - - self.expand = expand - out_features = features - if self.expand == True: - out_features = features // 2 - - self.out_conv = nn.Conv2d( - features, - out_features, - kernel_size=1, - stride=1, - padding=0, - bias=True, - groups=1, - ) - - self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) - self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, *xs): - """Forward pass. - Returns: - tensor: output - """ - output = xs[0] - - if len(xs) == 2: - res = self.resConfUnit1(xs[1]) - if self.width_ratio != 1: - res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear') - - output = self.skip_add.add(output, res) - # output += res - - output = self.resConfUnit2(output) - - if self.width_ratio != 1: - # and output.shape[3] < self.width_ratio * output.shape[2] - #size=(image.shape[]) - if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio: - shape = 3 * output.shape[3] - else: - shape = int(self.width_ratio * 2 * output.shape[2]) - output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear') - else: - output = nn.functional.interpolate(output, scale_factor=2, - mode="bilinear", align_corners=self.align_corners) - output = self.out_conv(output) - return output - -def make_fusion_block(features, use_bn, width_ratio=1): - return FeatureFusionBlock_custom( - features, - nn.ReLU(False), - deconv=False, - bn=use_bn, - expand=False, - align_corners=True, - width_ratio=width_ratio, - ) - -class Interpolate(nn.Module): - """Interpolation module.""" - - def __init__(self, scale_factor, mode, align_corners=False): - """Init. - Args: - scale_factor (float): scaling - mode (str): interpolation mode - """ - super(Interpolate, self).__init__() - - self.interp = nn.functional.interpolate - self.scale_factor = scale_factor - self.mode = mode - self.align_corners = align_corners - - def forward(self, x): - """Forward pass. - Args: - x (tensor): input - Returns: - tensor: interpolated data - """ - - x = self.interp( - x, - scale_factor=self.scale_factor, - mode=self.mode, - align_corners=self.align_corners, - ) - - return x - -class DPTOutputAdapter(nn.Module): - """DPT output adapter. - - :param num_cahnnels: Number of output channels - :param stride_level: tride level compared to the full-sized image. - E.g. 4 for 1/4th the size of the image. - :param patch_size_full: Int or tuple of the patch size over the full image size. - Patch size for smaller inputs will be computed accordingly. - :param hooks: Index of intermediate layers - :param layer_dims: Dimension of intermediate layers - :param feature_dim: Feature dimension - :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression - :param use_bn: If set to True, activates batch norm - :param dim_tokens_enc: Dimension of tokens coming from encoder - """ - - def __init__(self, - num_channels: int = 1, - stride_level: int = 1, - patch_size: Union[int, Tuple[int, int]] = 16, - main_tasks: Iterable[str] = ('rgb',), - hooks: List[int] = [2, 5, 8, 11], - layer_dims: List[int] = [96, 192, 384, 768], - feature_dim: int = 256, - last_dim: int = 32, - use_bn: bool = False, - dim_tokens_enc: Optional[int] = None, - head_type: str = 'regression', - output_width_ratio=1, - **kwargs): - super().__init__() - self.num_channels = num_channels - self.stride_level = stride_level - self.patch_size = pair(patch_size) - self.main_tasks = main_tasks - self.hooks = hooks - self.layer_dims = layer_dims - self.feature_dim = feature_dim - self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None - self.head_type = head_type - - # Actual patch height and width, taking into account stride of input - self.P_H = max(1, self.patch_size[0] // stride_level) - self.P_W = max(1, self.patch_size[1] // stride_level) - - self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False) - - self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio) - self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio) - self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio) - self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio) - - if self.head_type == 'regression': - # The "DPTDepthModel" head - self.head = nn.Sequential( - nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1), - Interpolate(scale_factor=2, mode="bilinear", align_corners=True), - nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0) - ) - elif self.head_type == 'semseg': - # The "DPTSegmentationModel" head - self.head = nn.Sequential( - nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False), - nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(), - nn.ReLU(True), - nn.Dropout(0.1, False), - nn.Conv2d(feature_dim, self.num_channels, kernel_size=1), - Interpolate(scale_factor=2, mode="bilinear", align_corners=True), - ) - else: - raise ValueError('DPT head_type must be "regression" or "semseg".') - - if self.dim_tokens_enc is not None: - self.init(dim_tokens_enc=dim_tokens_enc) - - def init(self, dim_tokens_enc=768): - """ - Initialize parts of decoder that are dependent on dimension of encoder tokens. - Should be called when setting up MultiMAE. - - :param dim_tokens_enc: Dimension of tokens coming from encoder - """ - #print(dim_tokens_enc) - - # Set up activation postprocessing layers - if isinstance(dim_tokens_enc, int): - dim_tokens_enc = 4 * [dim_tokens_enc] - - self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc] - - self.act_1_postprocess = nn.Sequential( - nn.Conv2d( - in_channels=self.dim_tokens_enc[0], - out_channels=self.layer_dims[0], - kernel_size=1, stride=1, padding=0, - ), - nn.ConvTranspose2d( - in_channels=self.layer_dims[0], - out_channels=self.layer_dims[0], - kernel_size=4, stride=4, padding=0, - bias=True, dilation=1, groups=1, - ) - ) - - self.act_2_postprocess = nn.Sequential( - nn.Conv2d( - in_channels=self.dim_tokens_enc[1], - out_channels=self.layer_dims[1], - kernel_size=1, stride=1, padding=0, - ), - nn.ConvTranspose2d( - in_channels=self.layer_dims[1], - out_channels=self.layer_dims[1], - kernel_size=2, stride=2, padding=0, - bias=True, dilation=1, groups=1, - ) - ) - - self.act_3_postprocess = nn.Sequential( - nn.Conv2d( - in_channels=self.dim_tokens_enc[2], - out_channels=self.layer_dims[2], - kernel_size=1, stride=1, padding=0, - ) - ) - - self.act_4_postprocess = nn.Sequential( - nn.Conv2d( - in_channels=self.dim_tokens_enc[3], - out_channels=self.layer_dims[3], - kernel_size=1, stride=1, padding=0, - ), - nn.Conv2d( - in_channels=self.layer_dims[3], - out_channels=self.layer_dims[3], - kernel_size=3, stride=2, padding=1, - ) - ) - - self.act_postprocess = nn.ModuleList([ - self.act_1_postprocess, - self.act_2_postprocess, - self.act_3_postprocess, - self.act_4_postprocess - ]) - - def adapt_tokens(self, encoder_tokens): - # Adapt tokens - x = [] - x.append(encoder_tokens[:, :]) - x = torch.cat(x, dim=-1) - return x - - def forward(self, encoder_tokens: List[torch.Tensor], image_size): - #input_info: Dict): - assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first' - H, W = image_size - - # Number of patches in height and width - N_H = H // (self.stride_level * self.P_H) - N_W = W // (self.stride_level * self.P_W) - - # Hook decoder onto 4 layers from specified ViT layers - layers = [encoder_tokens[hook] for hook in self.hooks] - - # Extract only task-relevant tokens and ignore global tokens. - layers = [self.adapt_tokens(l) for l in layers] - - # Reshape tokens to spatial representation - layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers] - - layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] - # Project layers to chosen feature dim - layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] - - # Fuse layers using refinement stages - path_4 = self.scratch.refinenet4(layers[3]) - path_3 = self.scratch.refinenet3(path_4, layers[2]) - path_2 = self.scratch.refinenet2(path_3, layers[1]) - path_1 = self.scratch.refinenet1(path_2, layers[0]) - - # Output head - out = self.head(path_1) - - return out diff --git a/modules/croco/models/head_downstream.py b/modules/croco/models/head_downstream.py deleted file mode 100644 index bd40c91ba244d6c3522c6efd4ed4d724b7bdc650..0000000000000000000000000000000000000000 --- a/modules/croco/models/head_downstream.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Heads for downstream tasks -# -------------------------------------------------------- - -""" -A head is a module where the __init__ defines only the head hyperparameters. -A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes. -The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height' -""" - -import torch -import torch.nn as nn -from .dpt_block import DPTOutputAdapter - - -class PixelwiseTaskWithDPT(nn.Module): - """ DPT module for CroCo. - by default, hooks_idx will be equal to: - * for encoder-only: 4 equally spread layers - * for encoder+decoder: last encoder + 3 equally spread layers of the decoder - """ - - def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768], - output_width_ratio=1, num_channels=1, postprocess=None, **kwargs): - super(PixelwiseTaskWithDPT, self).__init__() - self.return_all_blocks = True # backbone needs to return all layers - self.postprocess = postprocess - self.output_width_ratio = output_width_ratio - self.num_channels = num_channels - self.hooks_idx = hooks_idx - self.layer_dims = layer_dims - - def setup(self, croconet): - dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels} - if self.hooks_idx is None: - if hasattr(croconet, 'dec_blocks'): # encoder + decoder - step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth] - hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)] - else: # encoder only - step = croconet.enc_depth//4 - hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)] - self.hooks_idx = hooks_idx - print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}') - dpt_args['hooks'] = self.hooks_idx - dpt_args['layer_dims'] = self.layer_dims - self.dpt = DPTOutputAdapter(**dpt_args) - dim_tokens = [croconet.enc_embed_dim if hook0: - pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) - out: (M, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=float) - omega /= embed_dim / 2. - omega = 1. / 10000**omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - - -# -------------------------------------------------------- -# Interpolate position embeddings for high-resolution -# References: -# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py -# DeiT: https://github.com/facebookresearch/deit -# -------------------------------------------------------- -def interpolate_pos_embed(model, checkpoint_model): - if 'pos_embed' in checkpoint_model: - pos_embed_checkpoint = checkpoint_model['pos_embed'] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.patch_embed.num_patches - num_extra_tokens = model.pos_embed.shape[-2] - num_patches - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches ** 0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) - pos_tokens = torch.nn.functional.interpolate( - pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) - pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) - new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) - checkpoint_model['pos_embed'] = new_pos_embed - - -#---------------------------------------------------------- -# RoPE2D: RoPE implementation in 2D -#---------------------------------------------------------- - -try: - from models.curope import cuRoPE2D - RoPE2D = cuRoPE2D -except ImportError: - print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead') - - class RoPE2D(torch.nn.Module): - - def __init__(self, freq=100.0, F0=1.0): - super().__init__() - self.base = freq - self.F0 = F0 - self.cache = {} - - def get_cos_sin(self, D, seq_len, device, dtype): - if (D,seq_len,device,dtype) not in self.cache: - inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D)) - t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype) - freqs = torch.cat((freqs, freqs), dim=-1) - cos = freqs.cos() # (Seq, Dim) - sin = freqs.sin() - self.cache[D,seq_len,device,dtype] = (cos,sin) - return self.cache[D,seq_len,device,dtype] - - @staticmethod - def rotate_half(x): - x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - def apply_rope1d(self, tokens, pos1d, cos, sin): - assert pos1d.ndim==2 - cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :] - sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :] - return (tokens * cos) + (self.rotate_half(tokens) * sin) - - def forward(self, tokens, positions): - """ - input: - * tokens: batch_size x nheads x ntokens x dim - * positions: batch_size x ntokens x 2 (y and x position of each token) - output: - * tokens after appplying RoPE2D (batch_size x nheads x ntokens x dim) - """ - assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two" - D = tokens.size(3) // 2 - assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2 - cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype) - # split features into two along the feature dimension, and apply rope1d on each half - y, x = tokens.chunk(2, dim=-1) - y = self.apply_rope1d(y, positions[:,:,0], cos, sin) - x = self.apply_rope1d(x, positions[:,:,1], cos, sin) - tokens = torch.cat((y, x), dim=-1) - return tokens \ No newline at end of file diff --git a/modules/croco/pretrain.py b/modules/croco/pretrain.py deleted file mode 100644 index 2c45e488015ef5380c71d0381ff453fdb860759e..0000000000000000000000000000000000000000 --- a/modules/croco/pretrain.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Pre-training CroCo -# -------------------------------------------------------- -# References: -# MAE: https://github.com/facebookresearch/mae -# DeiT: https://github.com/facebookresearch/deit -# BEiT: https://github.com/microsoft/unilm/tree/master/beit -# -------------------------------------------------------- -import argparse -import datetime -import json -import numpy as np -import os -import sys -import time -import math -from pathlib import Path -from typing import Iterable - -import torch -import torch.distributed as dist -import torch.backends.cudnn as cudnn -from torch.utils.tensorboard import SummaryWriter -import torchvision.transforms as transforms -import torchvision.datasets as datasets - -import utils.misc as misc -from utils.misc import NativeScalerWithGradNormCount as NativeScaler -from models.croco import CroCoNet -from models.criterion import MaskedMSE -from datasets.pairs_dataset import PairsDataset - - -def get_args_parser(): - parser = argparse.ArgumentParser('CroCo pre-training', add_help=False) - # model and criterion - parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build") - parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss") - # dataset - parser.add_argument('--dataset', default='habitat_release', type=str, help="training set") - parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful - # training - parser.add_argument('--seed', default=0, type=int, help="Random seed") - parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus") - parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler") - parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch") - parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") - parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") - parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') - parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') - parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') - parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR') - parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining") - # others - parser.add_argument('--num_workers', default=8, type=int) - parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') - parser.add_argument('--local_rank', default=-1, type=int) - parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') - parser.add_argument('--save_freq', default=1, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth') - parser.add_argument('--keep_freq', default=20, type=int, help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth') - parser.add_argument('--print_freq', default=20, type=int, help='frequence (number of iterations) to print infos while training') - # paths - parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") - parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") - return parser - - - - -def main(args): - misc.init_distributed_mode(args) - global_rank = misc.get_rank() - world_size = misc.get_world_size() - - print("output_dir: "+args.output_dir) - if args.output_dir: - Path(args.output_dir).mkdir(parents=True, exist_ok=True) - - # auto resume - last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') - args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None - - print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) - print("{}".format(args).replace(', ', ',\n')) - - device = "cuda" if torch.cuda.is_available() else "cpu" - device = torch.device(device) - - # fix the seed - seed = args.seed + misc.get_rank() - torch.manual_seed(seed) - np.random.seed(seed) - - cudnn.benchmark = True - - ## training dataset and loader - print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms)) - dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir) - if world_size>1: - sampler_train = torch.utils.data.DistributedSampler( - dataset, num_replicas=world_size, rank=global_rank, shuffle=True - ) - print("Sampler_train = %s" % str(sampler_train)) - else: - sampler_train = torch.utils.data.RandomSampler(dataset) - data_loader_train = torch.utils.data.DataLoader( - dataset, sampler=sampler_train, - batch_size=args.batch_size, - num_workers=args.num_workers, - pin_memory=True, - drop_last=True, - ) - - ## model - print('Loading model: {:s}'.format(args.model)) - model = eval(args.model) - print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss)))) - criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss)) - - model.to(device) - model_without_ddp = model - print("Model = %s" % str(model_without_ddp)) - - eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() - if args.lr is None: # only base_lr is specified - args.lr = args.blr * eff_batch_size / 256 - print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) - print("actual lr: %.2e" % args.lr) - print("accumulate grad iterations: %d" % args.accum_iter) - print("effective batch size: %d" % eff_batch_size) - - if args.distributed: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True) - model_without_ddp = model.module - - param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers - optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) - print(optimizer) - loss_scaler = NativeScaler() - - misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) - - if global_rank == 0 and args.output_dir is not None: - log_writer = SummaryWriter(log_dir=args.output_dir) - else: - log_writer = None - - print(f"Start training until {args.max_epoch} epochs") - start_time = time.time() - for epoch in range(args.start_epoch, args.max_epoch): - if world_size>1: - data_loader_train.sampler.set_epoch(epoch) - - train_stats = train_one_epoch( - model, criterion, data_loader_train, - optimizer, device, epoch, loss_scaler, - log_writer=log_writer, - args=args - ) - - if args.output_dir and epoch % args.save_freq == 0 : - misc.save_model( - args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, - loss_scaler=loss_scaler, epoch=epoch, fname='last') - - if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1): - misc.save_model( - args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, - loss_scaler=loss_scaler, epoch=epoch) - - log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, - 'epoch': epoch,} - - if args.output_dir and misc.is_main_process(): - if log_writer is not None: - log_writer.flush() - with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: - f.write(json.dumps(log_stats) + "\n") - - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('Training time {}'.format(total_time_str)) - - - - -def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, - data_loader: Iterable, optimizer: torch.optim.Optimizer, - device: torch.device, epoch: int, loss_scaler, - log_writer=None, - args=None): - model.train(True) - metric_logger = misc.MetricLogger(delimiter=" ") - metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) - header = 'Epoch: [{}]'.format(epoch) - accum_iter = args.accum_iter - - optimizer.zero_grad() - - if log_writer is not None: - print('log_dir: {}'.format(log_writer.log_dir)) - - for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): - - # we use a per iteration lr scheduler - if data_iter_step % accum_iter == 0: - misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) - - image1 = image1.to(device, non_blocking=True) - image2 = image2.to(device, non_blocking=True) - with torch.cuda.amp.autocast(enabled=bool(args.amp)): - out, mask, target = model(image1, image2) - loss = criterion(out, mask, target) - - loss_value = loss.item() - - if not math.isfinite(loss_value): - print("Loss is {}, stopping training".format(loss_value)) - sys.exit(1) - - loss /= accum_iter - loss_scaler(loss, optimizer, parameters=model.parameters(), - update_grad=(data_iter_step + 1) % accum_iter == 0) - if (data_iter_step + 1) % accum_iter == 0: - optimizer.zero_grad() - - torch.cuda.synchronize() - - metric_logger.update(loss=loss_value) - - lr = optimizer.param_groups[0]["lr"] - metric_logger.update(lr=lr) - - loss_value_reduce = misc.all_reduce_mean(loss_value) - if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0: - # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes - epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) - log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) - log_writer.add_scalar('lr', lr, epoch_1000x) - - # gather the stats from all processes - metric_logger.synchronize_between_processes() - print("Averaged stats:", metric_logger) - return {k: meter.global_avg for k, meter in metric_logger.meters.items()} - - - -if __name__ == '__main__': - args = get_args_parser() - args = args.parse_args() - main(args) diff --git a/modules/croco/stereoflow/README.MD b/modules/croco/stereoflow/README.MD deleted file mode 100644 index 81595380fadd274b523e0cf77921b1b65cbedb34..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/README.MD +++ /dev/null @@ -1,318 +0,0 @@ -## CroCo-Stereo and CroCo-Flow - -This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained. -All commands should be launched from the root directory. - -### Simple inference example - -We provide a simple inference exemple for CroCo-Stereo and CroCo-Flow in the Totebook `croco-stereo-flow-demo.ipynb`. -Before running it, please download the trained models with: -``` -bash stereoflow/download_model.sh crocostereo.pth -bash stereoflow/download_model.sh crocoflow.pth -``` - -### Prepare data for training or evaluation - -Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`). -Please find below on the file structure should look for each dataset: -
-FlyingChairs - -``` -./data/stereoflow/FlyingChairs/ -└───chairs_split.txt -└───data/ - └─── ... -``` -
- -
-MPI-Sintel - -``` -./data/stereoflow/MPI-Sintel/ -└───training/ -│ └───clean/ -│ └───final/ -│ └───flow/ -└───test/ - └───clean/ - └───final/ -``` -
- -
-SceneFlow (including FlyingThings) - -``` -./data/stereoflow/SceneFlow/ -└───Driving/ -│ └───disparity/ -│ └───frames_cleanpass/ -│ └───frames_finalpass/ -└───FlyingThings/ -│ └───disparity/ -│ └───frames_cleanpass/ -│ └───frames_finalpass/ -│ └───optical_flow/ -└───Monkaa/ - └───disparity/ - └───frames_cleanpass/ - └───frames_finalpass/ -``` -
- -
-TartanAir - -``` -./data/stereoflow/TartanAir/ -└───abandonedfactory/ -│ └───.../ -└───abandonedfactory_night/ -│ └───.../ -└───.../ -``` -
- -
-Booster - -``` -./data/stereoflow/booster_gt/ -└───train/ - └───balanced/ - └───Bathroom/ - └───Bedroom/ - └───... -``` -
- -
-CREStereo - -``` -./data/stereoflow/crenet_stereo_trainset/ -└───stereo_trainset/ - └───crestereo/ - └───hole/ - └───reflective/ - └───shapenet/ - └───tree/ -``` -
- -
-ETH3D Two-view Low-res - -``` -./data/stereoflow/eth3d_lowres/ -└───test/ -│ └───lakeside_1l/ -│ └───... -└───train/ -│ └───delivery_area_1l/ -│ └───... -└───train_gt/ - └───delivery_area_1l/ - └───... -``` -
- -
-KITTI 2012 - -``` -./data/stereoflow/kitti-stereo-2012/ -└───testing/ -│ └───colored_0/ -│ └───colored_1/ -└───training/ - └───colored_0/ - └───colored_1/ - └───disp_occ/ - └───flow_occ/ -``` -
- -
-KITTI 2015 - -``` -./data/stereoflow/kitti-stereo-2015/ -└───testing/ -│ └───image_2/ -│ └───image_3/ -└───training/ - └───image_2/ - └───image_3/ - └───disp_occ_0/ - └───flow_occ/ -``` -
- -
-Middlebury - -``` -./data/stereoflow/middlebury -└───2005/ -│ └───train/ -│ └───Art/ -│ └───... -└───2006/ -│ └───Aloe/ -│ └───Baby1/ -│ └───... -└───2014/ -│ └───Adirondack-imperfect/ -│ └───Adirondack-perfect/ -│ └───... -└───2021/ -│ └───data/ -│ └───artroom1/ -│ └───artroom2/ -│ └───... -└───MiddEval3_F/ - └───test/ - │ └───Australia/ - │ └───... - └───train/ - └───Adirondack/ - └───... -``` -
- -
-Spring - -``` -./data/stereoflow/spring/ -└───test/ -│ └───0003/ -│ └───... -└───train/ - └───0001/ - └───... -``` -
- - -### CroCo-Stereo - -##### Main model - -The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark. - -``` -# Download the model -bash stereoflow/download_model.sh crocostereo.pth -# Middlebury v3 submission -python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9 -# Training command that was used, using checkpoint-last.pth -python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ -# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus: -torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ -``` - -For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets. - -``` -# Download the model -bash stereoflow/download_model.sh crocostereo_subtrain.pth -# Evaluation on validation sets -python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9 -# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus -python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/ -``` - -##### Other models - -
- Model for ETH3D - The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss. - - # Download the model - bash stereoflow/download_model.sh crocostereo_eth3d.pth - # ETH3D submission - python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9 - # Training command that was used - python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/ - -
- -
- Main model finetuned on Kitti - - # Download the model - bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth - # Kitti submission - python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9 - # Training that was used - python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5 -
- -
- Main model finetuned on Spring - - # Download the model - bash stereoflow/download_model.sh crocostereo_finetune_spring.pth - # Spring submission - python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 - # Training command that was used - python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/ -
- -
- Smaller models - To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with ViT-Base encoder and Small encoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth. -
- - -### CroCo-Flow - -##### Main model - -The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets. -It was used for our submission to the MPI-Sintel benchmark. - -``` -# Download the model -bash stereoflow/download_model.sh crocoflow.pth -# Evaluation -python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9 -# Sintel submission -python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9 -# Training command that was used, with checkpoint-best.pth -python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/ -``` - -##### Other models - -
- Main model finetuned on Kitti - - # Download the model - bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth - # Kitti submission - python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99 - # Training that was used, with checkpoint-last.pth - python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/ -
- -
- Main model finetuned on Spring - - # Download the model - bash stereoflow/download_model.sh crocoflow_finetune_spring.pth - # Spring submission - python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 - # Training command that was used, with checkpoint-last.pth - python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/ -
- -
- Smaller models - To train CroCo-Flow with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with ViT-Base encoder and Small encoder, use bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth. -
diff --git a/modules/croco/stereoflow/augmentor.py b/modules/croco/stereoflow/augmentor.py deleted file mode 100644 index 69e6117151988d94cbc4b385e0d88e982133bf10..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/augmentor.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Data augmentation for training stereo and flow -# -------------------------------------------------------- - -# References -# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py -# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py - - -import numpy as np -import random -from PIL import Image - -import cv2 -cv2.setNumThreads(0) -cv2.ocl.setUseOpenCL(False) - -import torch -from torchvision.transforms import ColorJitter -import torchvision.transforms.functional as FF - -class StereoAugmentor(object): - - def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5): - self.crop_size = crop_size - self.scale_prob = scale_prob - self.scale_xonly = scale_xonly - self.lhth = lhth - self.lminscale = lminscale - self.lmaxscale = lmaxscale - self.hminscale = hminscale - self.hmaxscale = hmaxscale - self.scale_interp_nearest = scale_interp_nearest - self.rightjitterprob = rightjitterprob - self.v_flip_prob = v_flip_prob - self.color_aug_asym = color_aug_asym - self.color_choice_prob = color_choice_prob - - def _random_scale(self, img1, img2, disp): - ch,cw = self.crop_size - h,w = img1.shape[:2] - if self.scale_prob>0. and np.random.rand()1.: - scale_x = clip_scale - scale_y = scale_x if not self.scale_xonly else 1.0 - img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) - img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) - disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x - return img1, img2, disp - - def _random_crop(self, img1, img2, disp): - h,w = img1.shape[:2] - ch,cw = self.crop_size - assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw) - offset_x = np.random.randint(w - cw + 1) - offset_y = np.random.randint(h - ch + 1) - img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw] - img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw] - disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw] - return img1, img2, disp - - def _random_vflip(self, img1, img2, disp): - # vertical flip - if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob: - img1 = np.copy(np.flipud(img1)) - img2 = np.copy(np.flipud(img2)) - disp = np.copy(np.flipud(disp)) - return img1, img2, disp - - def _random_rotate_shift_right(self, img2): - if self.rightjitterprob>0. and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1) - xx = xx[v] - yy = yy[v] - flow1 = flow1[v] - - flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones - flow[yy, xx] = flow1 - return flow - - def spatial_transform(self, img1, img2, flow, dname): - - if np.random.rand() < self.spatial_aug_prob: - # randomly sample scale - ht, wd = img1.shape[:2] - clip_min_scale = np.maximum( - (self.crop_size[0] + 8) / float(ht), - (self.crop_size[1] + 8) / float(wd)) - min_scale, max_scale = self.min_scale, self.max_scale - scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) - scale_x = scale - scale_y = scale - if np.random.rand() < self.stretch_prob: - scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) - scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) - scale_x = np.clip(scale_x, clip_min_scale, None) - scale_y = np.clip(scale_y, clip_min_scale, None) - # rescale the images - img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) - img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) - flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0) - elif dname=="Spring": - flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0) - - if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip - img1 = img1[:, ::-1] - img2 = img2[:, ::-1] - flow = flow[:, ::-1] * [-1.0, 1.0] - - if self.v_flip_prob>0. and np.random.rand() < self.v_flip_prob: # v-flip - img1 = img1[::-1, :] - img2 = img2[::-1, :] - flow = flow[::-1, :] * [1.0, -1.0] - - # In case no cropping - if img1.shape[0] - self.crop_size[0] > 0: - y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) - else: - y0 = 0 - if img1.shape[1] - self.crop_size[1] > 0: - x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) - else: - x0 = 0 - - img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] - img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] - flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] - - return img1, img2, flow - - def __call__(self, img1, img2, flow, dname): - img1, img2, flow = self.spatial_transform(img1, img2, flow, dname) - img1, img2 = self.color_transform(img1, img2) - img1 = np.ascontiguousarray(img1) - img2 = np.ascontiguousarray(img2) - flow = np.ascontiguousarray(flow) - return img1, img2, flow \ No newline at end of file diff --git a/modules/croco/stereoflow/criterion.py b/modules/croco/stereoflow/criterion.py deleted file mode 100644 index 57792ebeeee34827b317a4d32b7445837bb33f17..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/criterion.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Losses, metrics per batch, metrics per dataset -# -------------------------------------------------------- - -import torch -from torch import nn -import torch.nn.functional as F - -def _get_gtnorm(gt): - if gt.size(1)==1: # stereo - return gt - # flow - return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW - -############ losses without confidence - -class L1Loss(nn.Module): - - def __init__(self, max_gtnorm=None): - super().__init__() - self.max_gtnorm = max_gtnorm - self.with_conf = False - - def _error(self, gt, predictions): - return torch.abs(gt-predictions) - - def forward(self, predictions, gt, inspect=False): - mask = torch.isfinite(gt) - if self.max_gtnorm is not None: - mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant - - -class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b - def __init__(self, max_gtnorm=10000., a=0.25, b=4.): - super().__init__() - self.max_gtnorm = max_gtnorm - self.with_conf = True - self.a, self.b = a, b - - def forward(self, predictions, gt, conf): - mask = torch.isfinite(gt) - mask = mask[:,0,:,:] - if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant - -class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b - def __init__(self, max_gtnorm=None, a=3.0, b=3.0): - super().__init__() - self.max_gtnorm = max_gtnorm - self.with_conf = True - self.a, self.b = a, b - - def forward(self, predictions, gt, conf): - mask = torch.isfinite(gt) - mask = mask[:,0,:,:] - if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant - -############## metrics per batch - -class StereoMetrics(nn.Module): - - def __init__(self, do_quantile=False): - super().__init__() - self.bad_ths = [0.5,1,2,3] - self.do_quantile = do_quantile - - def forward(self, predictions, gt): - B = predictions.size(0) - metrics = {} - gtcopy = gt.clone() - mask = torch.isfinite(gtcopy) - gtcopy[~mask] = 999999.0 # we make a copy and put a non-infinite value, such that it does not become nan once multiplied by the mask value 0 - Npx = mask.view(B,-1).sum(dim=1) - L1error = (torch.abs(gtcopy-predictions)*mask).view(B,-1) - L2error = (torch.square(gtcopy-predictions)*mask).view(B,-1) - # avgerr - metrics['avgerr'] = torch.mean(L1error.sum(dim=1)/Npx ) - # rmse - metrics['rmse'] = torch.sqrt(L2error.sum(dim=1)/Npx).mean(dim=0) - # err > t for t in [0.5,1,2,3] - for ths in self.bad_ths: - metrics['bad@{:.1f}'.format(ths)] = (((L1error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 - return metrics - -class FlowMetrics(nn.Module): - def __init__(self): - super().__init__() - self.bad_ths = [1,3,5] - - def forward(self, predictions, gt): - B = predictions.size(0) - metrics = {} - mask = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite - Npx = mask.view(B,-1).sum(dim=1) - gtcopy = gt.clone() # to compute L1/L2 error, we need to have non-infinite value, the error computed at this locations will be ignored - gtcopy[:,0,:,:][~mask] = 999999.0 - gtcopy[:,1,:,:][~mask] = 999999.0 - L1error = (torch.abs(gtcopy-predictions).sum(dim=1)*mask).view(B,-1) - L2error = (torch.sqrt(torch.sum(torch.square(gtcopy-predictions),dim=1))*mask).view(B,-1) - metrics['L1err'] = torch.mean(L1error.sum(dim=1)/Npx ) - metrics['EPE'] = torch.mean(L2error.sum(dim=1)/Npx ) - for ths in self.bad_ths: - metrics['bad@{:.1f}'.format(ths)] = (((L2error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 - return metrics - -############## metrics per dataset -## we update the average and maintain the number of pixels while adding data batch per batch -## at the beggining, call reset() -## after each batch, call add_batch(...) -## at the end: call get_results() - -class StereoDatasetMetrics(nn.Module): - - def __init__(self): - super().__init__() - self.bad_ths = [0.5,1,2,3] - - def reset(self): - self.agg_N = 0 # number of pixels so far - self.agg_L1err = torch.tensor(0.0) # L1 error so far - self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels - self._metrics = None - - def add_batch(self, predictions, gt): - assert predictions.size(1)==1, predictions.size() - assert gt.size(1)==1, gt.size() - if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... - L1err = torch.minimum( torch.minimum( torch.minimum( - torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), - torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), - torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), - torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) - valid = torch.isfinite(L1err) - else: - valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite - L1err = torch.sum(torch.abs(gt-predictions),dim=1) - N = valid.sum() - Nnew = self.agg_N + N - self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew - self.agg_N = Nnew - for i,th in enumerate(self.bad_ths): - self.agg_Nbad[i] += (L1err[valid]>th).sum().cpu() - - def _compute_metrics(self): - if self._metrics is not None: return - out = {} - out['L1err'] = self.agg_L1err.item() - for i,th in enumerate(self.bad_ths): - out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0 - self._metrics = out - - def get_results(self): - self._compute_metrics() # to avoid recompute them multiple times - return self._metrics - -class FlowDatasetMetrics(nn.Module): - - def __init__(self): - super().__init__() - self.bad_ths = [0.5,1,3,5] - self.speed_ths = [(0,10),(10,40),(40,torch.inf)] - - def reset(self): - self.agg_N = 0 # number of pixels so far - self.agg_L1err = torch.tensor(0.0) # L1 error so far - self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far - self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels - self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far - self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far - self._metrics = None - self.pairname_results = {} - - def add_batch(self, predictions, gt): - assert predictions.size(1)==2, predictions.size() - assert gt.size(1)==2, gt.size() - if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... - L1err = torch.minimum( torch.minimum( torch.minimum( - torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), - torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), - torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), - torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) - L2err = torch.minimum( torch.minimum( torch.minimum( - torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)), - torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))), - torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))), - torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1))) - valid = torch.isfinite(L1err) - gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\ - torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them - else: - valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite - L1err = torch.sum(torch.abs(gt-predictions),dim=1) - L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1)) - gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1)) - N = valid.sum() - Nnew = self.agg_N + N - self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew - self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew - self.agg_N = Nnew - for i,th in enumerate(self.bad_ths): - self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu() - for i,(th1,th2) in enumerate(self.speed_ths): - vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time) - self._prepare_data() - self._load_or_build_cache() - - def prepare_data(self): - """ - to be defined for each dataset - """ - raise NotImplementedError - - def __len__(self): - return len(self.pairnames) # each pairname is typically of the form (str, int1, int2) - - def __getitem__(self, index): - pairname = self.pairnames[index] - - # get filenames - img1name = self.pairname_to_img1name(pairname) - img2name = self.pairname_to_img2name(pairname) - flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None - - # load images and disparities - img1 = _read_img(img1name) - img2 = _read_img(img2name) - flow = self.load_flow(flowname) if flowname is not None else None - - # apply augmentations - if self.augmentor is not None: - img1, img2, flow = self.augmentor(img1, img2, flow, self.name) - - if self.totensor: - img1 = img_to_tensor(img1) - img2 = img_to_tensor(img2) - if flow is not None: - flow = flow_to_tensor(flow) - else: - flow = torch.tensor([]) # to allow dataloader batching with default collate_gn - pairname = str(pairname) # transform potential tuple to str to be able to batch it - - return img1, img2, flow, pairname - - def __rmul__(self, v): - self.rmul *= v - self.pairnames = v * self.pairnames - return self - - def __str__(self): - return f'{self.__class__.__name__}_{self.split}' - - def __repr__(self): - s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' - if self.rmul==1: - s+=f'\n\tnum pairs: {len(self.pairnames)}' - else: - s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})' - return s - - def _set_root(self): - self.root = dataset_to_root[self.name] - assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}" - - def _load_or_build_cache(self): - cache_file = osp.join(cache_dir, self.name+'.pkl') - if osp.isfile(cache_file): - with open(cache_file, 'rb') as fid: - self.pairnames = pickle.load(fid)[self.split] - else: - tosave = self._build_cache() - os.makedirs(cache_dir, exist_ok=True) - with open(cache_file, 'wb') as fid: - pickle.dump(tosave, fid) - self.pairnames = tosave[self.split] - -class TartanAirDataset(FlowDataset): - - def _prepare_data(self): - self.name = "TartanAir" - self._set_root() - assert self.split in ['train'] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[1])) - self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[2])) - self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], 'flow/{:06d}_{:06d}_flow.npy'.format(pairname[1],pairname[2])) - self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], '{:06d}_{:06d}'.format(pairname[1], pairname[2])) - self.load_flow = _read_numpy_flow - - def _build_cache(self): - seqs = sorted(os.listdir(self.root)) - pairs = [(osp.join(s,s,difficulty,Pxxx),int(a[:6]),int(a[:6])+1) for s in seqs for difficulty in ['Easy','Hard'] for Pxxx in sorted(os.listdir(osp.join(self.root,s,s,difficulty))) for a in sorted(os.listdir(osp.join(self.root,s,s,difficulty,Pxxx,'image_left/')))[:-1]] - assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir" - tosave = {'train': pairs} - return tosave - -class FlyingChairsDataset(FlowDataset): - - def _prepare_data(self): - self.name = "FlyingChairs" - self._set_root() - assert self.split in ['train','val'] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, 'data', pairname+'_img1.ppm') - self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm') - self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo') - self.pairname_to_str = lambda pairname: pairname - self.load_flow = _read_flo_file - - def _build_cache(self): - split_file = osp.join(self.root, 'chairs_split.txt') - split_list = np.loadtxt(split_file, dtype=np.int32) - trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1] - valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1] - assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in MPI-Sintel" - tosave = {'train': trainpairs, 'val': valpairs} - return tosave - -class FlyingThingsDataset(FlowDataset): - - def _prepare_data(self): - self.name = "FlyingThings" - self._set_root() - assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1])) - self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2])) - self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) - self.pairname_to_str = lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) - self.load_flow = _read_pfm_flow - - def _build_cache(self): - tosave = {} - # train and test splits for the different passes - for set_ in ['train', 'test']: - sroot = osp.join(self.root, 'optical_flow', set_.upper()) - fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')]) - pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]] - pairs = [(a,i,i+1) for a,i in pp] - pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp] - assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things" - for cam in ['left','right']: - camstr = '' if cam=='left' else f'_{cam}cam' - for pass_ in ['final', 'clean']: - tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs] - tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}'] - # test1024: this is the same split as unimatch 'validation' split - # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229 - test1024_nsamples = 1024 - alltest_nsamples = len(tosave['test_cleanpass']) # 7866 - stride = alltest_nsamples // test1024_nsamples - remove = alltest_nsamples % test1024_nsamples - for cam in ['left','right']: - camstr = '' if cam=='left' else f'_{cam}cam' - for pass_ in ['final','clean']: - tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before - assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things" - tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}'] - return tosave - - -class MPISintelDataset(FlowDataset): - - def _prepare_data(self): - self.name = "MPISintel" - self._set_root() - assert self.split in [s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1])) - self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1)) - self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1])) - self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1])) - self.load_flow = _read_flo_file - - def _build_cache(self): - trainseqs = sorted(os.listdir(self.root+'training/clean')) - trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))] - subvalseqs = ['temple_2','temple_3'] - subtrainseqs = [s for s in trainseqs if s not in subvalseqs] - subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)] - subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)] - testseqs = sorted(os.listdir(self.root+'test/clean')) - testpairs = [ (osp.join('test/clean', s),i) for s in testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))] - assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel" - tosave = {} - tosave['train_cleanpass'] = trainpairs - tosave['test_cleanpass'] = testpairs - tosave['subval_cleanpass'] = subvalpairs - tosave['subtrain_cleanpass'] = subtrainpairs - for t in ['train','test','subval','subtrain']: - tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']] - tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass'] - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, _time): - assert prediction.shape[2]==2 - outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writeFlowFile(prediction, outfile) - - def finalize_submission(self, outdir): - assert self.split == 'test_allpass' - bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg - if os.path.isfile(bundle_exe): - cmd = f'{bundle_exe} "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at: "{outdir}/submission/bundled.lzma"') - else: - print('Could not find bundler executable for submission.') - print('Please download it and run:') - print(f' "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"') - -class SpringDataset(FlowDataset): - - def _prepare_data(self): - self.name = "Spring" - self._set_root() - assert self.split in ['train','test','subtrain','subval'] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4])) - self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4]+(1 if pairname[2]=='FW' else -1))) - self.pairname_to_flowname = lambda pairname: None if pairname[0]=='test' else osp.join(self.root, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') - self.pairname_to_str = lambda pairname: osp.join(pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}') - self.load_flow = _read_hdf5_flow - - def _build_cache(self): - # train - trainseqs = sorted(os.listdir( osp.join(self.root,'train'))) - trainpairs = [] - for leftright in ['left','right']: - for fwbw in ['FW','BW']: - trainpairs += [('train',s,fwbw,leftright,int(f[len(f'flow_{fwbw}_{leftright}_'):-len('.flo5')])) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,f'flow_{fwbw}_{leftright}')))] - # test - testseqs = sorted(os.listdir( osp.join(self.root,'test'))) - testpairs = [] - for leftright in ['left','right']: - testpairs += [('test',s,'FW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] - testpairs += [('test',s,'BW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])+1) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] - # subtrain / subval - subtrainpairs = [p for p in trainpairs if p[1]!='0041'] - subvalpairs = [p for p in trainpairs if p[1]=='0041'] - assert len(trainpairs)==19852 and len(testpairs)==3960 and len(subtrainpairs)==19472 and len(subvalpairs)==380, "incorrect parsing of pairs in Spring" - tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==3 - assert prediction.shape[2]==2 - assert prediction.dtype==np.float32 - outfile = osp.join(outdir, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writeFlo5File(prediction, outfile) - - def finalize_submission(self, outdir): - assert self.split=='test' - exe = "{self.root}/flow_subsampling" - if os.path.isfile(exe): - cmd = f'cd "{outdir}/test"; {exe} .' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/test/flow_submission.hdf5') - else: - print('Could not find flow_subsampling executable for submission.') - print('Please download it and run:') - print(f'cd "{outdir}/test"; .') - - -class Kitti12Dataset(FlowDataset): - - def _prepare_data(self): - self.name = "Kitti12" - self._set_root() - assert self.split in ['train','test'] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') - self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') - self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/flow_occ/')+'_10.png') - self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/') - self.load_flow = _read_kitti_flow - - def _build_cache(self): - trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)] - testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)] - assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12" - tosave = {'train': trainseqs, 'test': testseqs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==3 - assert prediction.shape[2]==2 - outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writeFlowKitti(outfile, prediction) - - def finalize_submission(self, outdir): - assert self.split=='test' - cmd = f'cd {outdir}/; zip -r "kitti12_flow_results.zip" .' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/kitti12_flow_results.zip') - - -class Kitti15Dataset(FlowDataset): - - def _prepare_data(self): - self.name = "Kitti15" - self._set_root() - assert self.split in ['train','subtrain','subval','test'] - self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') - self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') - self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/flow_occ/')+'_10.png') - self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') - self.load_flow = _read_kitti_flow - - def _build_cache(self): - trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] - subtrainseqs = trainseqs[:-10] - subvalseqs = trainseqs[-10:] - testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] - assert len(trainseqs)==200 and len(subtrainseqs)==190 and len(subvalseqs)==10 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" - tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==3 - assert prediction.shape[2]==2 - outfile = os.path.join(outdir, 'flow', pairname.split('/')[-1]+'_10.png') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writeFlowKitti(outfile, prediction) - - def finalize_submission(self, outdir): - assert self.split=='test' - cmd = f'cd {outdir}/; zip -r "kitti15_flow_results.zip" flow' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/kitti15_flow_results.zip') - - -import cv2 -def _read_numpy_flow(filename): - return np.load(filename) - -def _read_pfm_flow(filename): - f, _ = _read_pfm(filename) - assert np.all(f[:,:,2]==0.0) - return np.ascontiguousarray(f[:,:,:2]) - -TAG_FLOAT = 202021.25 # tag to check the sanity of the file -TAG_STRING = 'PIEH' # string containing the tag -MIN_WIDTH = 1 -MAX_WIDTH = 99999 -MIN_HEIGHT = 1 -MAX_HEIGHT = 99999 -def readFlowFile(filename): - """ - readFlowFile() reads a flow file into a 2-band np.array. - if does not exist, an IOError is raised. - if does not finish by '.flo' or the tag, the width, the height or the file's size is illegal, an Expcetion is raised. - ---- PARAMETERS ---- - filename: string containg the name of the file to read a flow - ---- OUTPUTS ---- - a np.array of dimension (height x width x 2) containing the flow of type 'float32' - """ - - # check filename - if not filename.endswith(".flo"): - raise Exception("readFlowFile({:s}): filename must finish with '.flo'".format(filename)) - - # open the file and read it - with open(filename,'rb') as f: - # check tag - tag = struct.unpack('f',f.read(4))[0] - if tag != TAG_FLOAT: - raise Exception("flow_utils.readFlowFile({:s}): wrong tag".format(filename)) - # read dimension - w,h = struct.unpack('ii',f.read(8)) - if w < MIN_WIDTH or w > MAX_WIDTH: - raise Exception("flow_utils.readFlowFile({:s}: illegal width {:d}".format(filename,w)) - if h < MIN_HEIGHT or h > MAX_HEIGHT: - raise Exception("flow_utils.readFlowFile({:s}: illegal height {:d}".format(filename,h)) - flow = np.fromfile(f,'float32') - if not flow.shape == (h*w*2,): - raise Exception("flow_utils.readFlowFile({:s}: illegal size of the file".format(filename)) - flow.shape = (h,w,2) - return flow - -def writeFlowFile(flow,filename): - """ - writeFlowFile(flow,) write flow to the file . - if does not exist, an IOError is raised. - if does not finish with '.flo' or the flow has not 2 bands, an Exception is raised. - ---- PARAMETERS ---- - flow: np.array of dimension (height x width x 2) containing the flow to write - filename: string containg the name of the file to write a flow - """ - - # check filename - if not filename.endswith(".flo"): - raise Exception("flow_utils.writeFlowFile(,{:s}): filename must finish with '.flo'".format(filename)) - - if not flow.shape[2:] == (2,): - raise Exception("flow_utils.writeFlowFile(,{:s}): must have 2 bands".format(filename)) - - - # open the file and write it - with open(filename,'wb') as f: - # write TAG - f.write( TAG_STRING.encode('utf-8') ) - # write dimension - f.write( struct.pack('ii',flow.shape[1],flow.shape[0]) ) - # write the flow - - flow.astype(np.float32).tofile(f) - -_read_flo_file = readFlowFile - -def _read_kitti_flow(filename): - flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) - flow = flow[:, :, ::-1].astype(np.float32) - valid = flow[:, :, 2]>0 - flow = flow[:, :, :2] - flow = (flow - 2 ** 15) / 64.0 - flow[~valid,0] = np.inf - flow[~valid,1] = np.inf - return flow -_read_hd1k_flow = _read_kitti_flow - - -def writeFlowKitti(filename, uv): - uv = 64.0 * uv + 2 ** 15 - valid = np.ones([uv.shape[0], uv.shape[1], 1]) - uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) - cv2.imwrite(filename, uv[..., ::-1]) - -def writeFlo5File(flow, filename): - with h5py.File(filename, "w") as f: - f.create_dataset("flow", data=flow, compression="gzip", compression_opts=5) - -def _read_hdf5_flow(filename): - flow = np.asarray(h5py.File(filename)['flow']) - flow[np.isnan(flow)] = np.inf # make invalid values as +inf - return flow.astype(np.float32) - -# flow visualization -RY = 15 -YG = 6 -GC = 4 -CB = 11 -BM = 13 -MR = 6 -UNKNOWN_THRESH = 1e9 - -def colorTest(): - """ - flow_utils.colorTest(): display an example of image showing the color encoding scheme - """ - import matplotlib.pylab as plt - truerange = 1 - h,w = 151,151 - trange = truerange*1.04 - s2 = round(h/2) - x,y = np.meshgrid(range(w),range(h)) - u = x*trange/s2-trange - v = y*trange/s2-trange - img = _computeColor(np.concatenate((u[:,:,np.newaxis],v[:,:,np.newaxis]),2)/trange/np.sqrt(2)) - plt.imshow(img) - plt.axis('off') - plt.axhline(round(h/2),color='k') - plt.axvline(round(w/2),color='k') - -def flowToColor(flow, maxflow=None, maxmaxflow=None, saturate=False): - """ - flow_utils.flowToColor(flow): return a color code flow field, normalized based on the maximum l2-norm of the flow - flow_utils.flowToColor(flow,maxflow): return a color code flow field, normalized by maxflow - ---- PARAMETERS ---- - flow: flow to display of shape (height x width x 2) - maxflow (default:None): if given, normalize the flow by its value, otherwise by the flow norm - maxmaxflow (default:None): if given, normalize the flow by the max of its value and the flow norm - ---- OUTPUT ---- - an np.array of shape (height x width x 3) of type uint8 containing a color code of the flow - """ - h,w,n = flow.shape - # check size of flow - assert n == 2, "flow_utils.flowToColor(flow): flow must have 2 bands" - # fix unknown flow - unknown_idx = np.max(np.abs(flow),2)>UNKNOWN_THRESH - flow[unknown_idx] = 0.0 - # compute max flow if needed - if maxflow is None: - maxflow = flowMaxNorm(flow) - if maxmaxflow is not None: - maxflow = min(maxmaxflow, maxflow) - # normalize flow - eps = np.spacing(1) # minimum positive float value to avoid division by 0 - # compute the flow - img = _computeColor(flow/(maxflow+eps), saturate=saturate) - # put black pixels in unknown location - img[ np.tile( unknown_idx[:,:,np.newaxis],[1,1,3]) ] = 0.0 - return img - -def flowMaxNorm(flow): - """ - flow_utils.flowMaxNorm(flow): return the maximum of the l2-norm of the given flow - ---- PARAMETERS ---- - flow: the flow - - ---- OUTPUT ---- - a float containing the maximum of the l2-norm of the flow - """ - return np.max( np.sqrt( np.sum( np.square( flow ) , 2) ) ) - -def _computeColor(flow, saturate=True): - """ - flow_utils._computeColor(flow): compute color codes for the flow field flow - - ---- PARAMETERS ---- - flow: np.array of dimension (height x width x 2) containing the flow to display - ---- OUTPUTS ---- - an np.array of dimension (height x width x 3) containing the color conversion of the flow - """ - # set nan to 0 - nanidx = np.isnan(flow[:,:,0]) - flow[nanidx] = 0.0 - - # colorwheel - ncols = RY + YG + GC + CB + BM + MR - nchans = 3 - colorwheel = np.zeros((ncols,nchans),'uint8') - col = 0; - #RY - colorwheel[:RY,0] = 255 - colorwheel[:RY,1] = [(255*i) // RY for i in range(RY)] - col += RY - # YG - colorwheel[col:col+YG,0] = [255 - (255*i) // YG for i in range(YG)] - colorwheel[col:col+YG,1] = 255 - col += YG - # GC - colorwheel[col:col+GC,1] = 255 - colorwheel[col:col+GC,2] = [(255*i) // GC for i in range(GC)] - col += GC - # CB - colorwheel[col:col+CB,1] = [255 - (255*i) // CB for i in range(CB)] - colorwheel[col:col+CB,2] = 255 - col += CB - # BM - colorwheel[col:col+BM,0] = [(255*i) // BM for i in range(BM)] - colorwheel[col:col+BM,2] = 255 - col += BM - # MR - colorwheel[col:col+MR,0] = 255 - colorwheel[col:col+MR,2] = [255 - (255*i) // MR for i in range(MR)] - - # compute utility variables - rad = np.sqrt( np.sum( np.square(flow) , 2) ) # magnitude - a = np.arctan2( -flow[:,:,1] , -flow[:,:,0]) / np.pi # angle - fk = (a+1)/2 * (ncols-1) # map [-1,1] to [0,ncols-1] - k0 = np.floor(fk).astype('int') - k1 = k0+1 - k1[k1==ncols] = 0 - f = fk-k0 - - if not saturate: - rad = np.minimum(rad,1) - - # compute the image - img = np.zeros( (flow.shape[0],flow.shape[1],nchans), 'uint8' ) - for i in range(nchans): - tmp = colorwheel[:,i].astype('float') - col0 = tmp[k0]/255 - col1 = tmp[k1]/255 - col = (1-f)*col0 + f*col1 - idx = (rad <= 1) - col[idx] = 1-rad[idx]*(1-col[idx]) # increase saturation with radius - col[~idx] *= 0.75 # out of range - img[:,:,i] = (255*col*(1-nanidx.astype('float'))).astype('uint8') - - return img - -# flow dataset getter - -def get_train_dataset_flow(dataset_str, augmentor=True, crop_size=None): - dataset_str = dataset_str.replace('(','Dataset(') - if augmentor: - dataset_str = dataset_str.replace(')',', augmentor=True)') - if crop_size is not None: - dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) - return eval(dataset_str) - -def get_test_datasets_flow(dataset_str): - dataset_str = dataset_str.replace('(','Dataset(') - return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/modules/croco/stereoflow/datasets_stereo.py b/modules/croco/stereoflow/datasets_stereo.py deleted file mode 100644 index dbdf841a6650afa71ae5782702902c79eba31a5c..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/datasets_stereo.py +++ /dev/null @@ -1,674 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Dataset structure for stereo -# -------------------------------------------------------- - -import sys, os -import os.path as osp -import pickle -import numpy as np -from PIL import Image -import json -import h5py -from glob import glob -import cv2 - -import torch -from torch.utils import data - -from .augmentor import StereoAugmentor - - - -dataset_to_root = { - 'CREStereo': './data/stereoflow//crenet_stereo_trainset/stereo_trainset/crestereo/', - 'SceneFlow': './data/stereoflow//SceneFlow/', - 'ETH3DLowRes': './data/stereoflow/eth3d_lowres/', - 'Booster': './data/stereoflow/booster_gt/', - 'Middlebury2021': './data/stereoflow/middlebury/2021/data/', - 'Middlebury2014': './data/stereoflow/middlebury/2014/', - 'Middlebury2006': './data/stereoflow/middlebury/2006/', - 'Middlebury2005': './data/stereoflow/middlebury/2005/train/', - 'MiddleburyEval3': './data/stereoflow/middlebury/MiddEval3/', - 'Spring': './data/stereoflow/spring/', - 'Kitti15': './data/stereoflow/kitti-stereo-2015/', - 'Kitti12': './data/stereoflow/kitti-stereo-2012/', -} -cache_dir = "./data/stereoflow/datasets_stereo_cache/" - - -in1k_mean = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1) -in1k_std = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1) -def img_to_tensor(img): - img = torch.from_numpy(img).permute(2, 0, 1).float() / 255. - img = (img-in1k_mean)/in1k_std - return img -def disp_to_tensor(disp): - return torch.from_numpy(disp)[None,:,:] - -class StereoDataset(data.Dataset): - - def __init__(self, split, augmentor=False, crop_size=None, totensor=True): - self.split = split - if not augmentor: assert crop_size is None - if crop_size: assert augmentor - self.crop_size = crop_size - self.augmentor_str = augmentor - self.augmentor = StereoAugmentor(crop_size) if augmentor else None - self.totensor = totensor - self.rmul = 1 # keep track of rmul - self.has_constant_resolution = True # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time) - self._prepare_data() - self._load_or_build_cache() - - def prepare_data(self): - """ - to be defined for each dataset - """ - raise NotImplementedError - - def __len__(self): - return len(self.pairnames) - - def __getitem__(self, index): - pairname = self.pairnames[index] - - # get filenames - Limgname = self.pairname_to_Limgname(pairname) - Rimgname = self.pairname_to_Rimgname(pairname) - Ldispname = self.pairname_to_Ldispname(pairname) if self.pairname_to_Ldispname is not None else None - - # load images and disparities - Limg = _read_img(Limgname) - Rimg = _read_img(Rimgname) - disp = self.load_disparity(Ldispname) if Ldispname is not None else None - - # sanity check - if disp is not None: assert np.all(disp>0) or self.name=="Spring", (self.name, pairname, Ldispname) - - # apply augmentations - if self.augmentor is not None: - Limg, Rimg, disp = self.augmentor(Limg, Rimg, disp, self.name) - - if self.totensor: - Limg = img_to_tensor(Limg) - Rimg = img_to_tensor(Rimg) - if disp is None: - disp = torch.tensor([]) # to allow dataloader batching with default collate_gn - else: - disp = disp_to_tensor(disp) - - return Limg, Rimg, disp, str(pairname) - - def __rmul__(self, v): - self.rmul *= v - self.pairnames = v * self.pairnames - return self - - def __str__(self): - return f'{self.__class__.__name__}_{self.split}' - - def __repr__(self): - s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' - if self.rmul==1: - s+=f'\n\tnum pairs: {len(self.pairnames)}' - else: - s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})' - return s - - def _set_root(self): - self.root = dataset_to_root[self.name] - assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}" - - def _load_or_build_cache(self): - cache_file = osp.join(cache_dir, self.name+'.pkl') - if osp.isfile(cache_file): - with open(cache_file, 'rb') as fid: - self.pairnames = pickle.load(fid)[self.split] - else: - tosave = self._build_cache() - os.makedirs(cache_dir, exist_ok=True) - with open(cache_file, 'wb') as fid: - pickle.dump(tosave, fid) - self.pairnames = tosave[self.split] - -class CREStereoDataset(StereoDataset): - - def _prepare_data(self): - self.name = 'CREStereo' - self._set_root() - assert self.split in ['train'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_left.jpg') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'_right.jpg') - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname+'_left.disp.png') - self.pairname_to_str = lambda pairname: pairname - self.load_disparity = _read_crestereo_disp - - - def _build_cache(self): - allpairs = [s+'/'+f[:-len('_left.jpg')] for s in sorted(os.listdir(self.root)) for f in sorted(os.listdir(self.root+'/'+s)) if f.endswith('_left.jpg')] - assert len(allpairs)==200000, "incorrect parsing of pairs in CreStereo" - tosave = {'train': allpairs} - return tosave - -class SceneFlowDataset(StereoDataset): - - def _prepare_data(self): - self.name = "SceneFlow" - self._set_root() - assert self.split in ['train_finalpass','train_cleanpass','train_allpass','test_finalpass','test_cleanpass','test_allpass','test1of100_cleanpass','test1of100_finalpass'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/left/','/right/') - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname).replace('/frames_finalpass/','/disparity/').replace('/frames_cleanpass/','/disparity/')[:-4]+'.pfm' - self.pairname_to_str = lambda pairname: pairname[:-4] - self.load_disparity = _read_sceneflow_disp - - def _build_cache(self): - trainpairs = [] - # driving - pairs = sorted(glob(self.root+'Driving/frames_finalpass/*/*/*/left/*.png')) - pairs = list(map(lambda x: x[len(self.root):], pairs)) - assert len(pairs) == 4400, "incorrect parsing of pairs in SceneFlow" - trainpairs += pairs - # monkaa - pairs = sorted(glob(self.root+'Monkaa/frames_finalpass/*/left/*.png')) - pairs = list(map(lambda x: x[len(self.root):], pairs)) - assert len(pairs) == 8664, "incorrect parsing of pairs in SceneFlow" - trainpairs += pairs - # flyingthings - pairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TRAIN/*/*/left/*.png')) - pairs = list(map(lambda x: x[len(self.root):], pairs)) - assert len(pairs) == 22390, "incorrect parsing of pairs in SceneFlow" - trainpairs += pairs - assert len(trainpairs) == 35454, "incorrect parsing of pairs in SceneFlow" - testpairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TEST/*/*/left/*.png')) - testpairs = list(map(lambda x: x[len(self.root):], testpairs)) - assert len(testpairs) == 4370, "incorrect parsing of pairs in SceneFlow" - test1of100pairs = testpairs[::100] - assert len(test1of100pairs) == 44, "incorrect parsing of pairs in SceneFlow" - # all - tosave = {'train_finalpass': trainpairs, - 'train_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), trainpairs)), - 'test_finalpass': testpairs, - 'test_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), testpairs)), - 'test1of100_finalpass': test1of100pairs, - 'test1of100_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), test1of100pairs)), - } - tosave['train_allpass'] = tosave['train_finalpass']+tosave['train_cleanpass'] - tosave['test_allpass'] = tosave['test_finalpass']+tosave['test_cleanpass'] - return tosave - -class Md21Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Middlebury2021" - self._set_root() - assert self.split in ['train','subtrain','subval'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/im0','/im1')) - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp0.pfm') - self.pairname_to_str = lambda pairname: pairname[:-4] - self.load_disparity = _read_middlebury_disp - - def _build_cache(self): - seqs = sorted(os.listdir(self.root)) - trainpairs = [] - for s in seqs: - #trainpairs += [s+'/im0.png'] # we should remove it, it is included as such in other lightings - trainpairs += [s+'/ambient/'+b+'/'+a for b in sorted(os.listdir(osp.join(self.root,s,'ambient'))) for a in sorted(os.listdir(osp.join(self.root,s,'ambient',b))) if a.startswith('im0')] - assert len(trainpairs)==355 - subtrainpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[:-2])] - subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[-2:])] - assert len(subtrainpairs)==335 and len(subvalpairs)==20, "incorrect parsing of pairs in Middlebury 2021" - tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - -class Md14Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Middlebury2014" - self._set_root() - assert self.split in ['train','subtrain','subval'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'im0.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'disp0.pfm') - self.pairname_to_str = lambda pairname: pairname[:-4] - self.load_disparity = _read_middlebury_disp - self.has_constant_resolution = False - - def _build_cache(self): - seqs = sorted(os.listdir(self.root)) - trainpairs = [] - for s in seqs: - trainpairs += [s+'/im1.png',s+'/im1E.png',s+'/im1L.png'] - assert len(trainpairs)==138 - valseqs = ['Umbrella-imperfect','Vintage-perfect'] - assert all(s in seqs for s in valseqs) - subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] - subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] - assert len(subtrainpairs)==132 and len(subvalpairs)==6, "incorrect parsing of pairs in Middlebury 2014" - tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - -class Md06Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Middlebury2006" - self._set_root() - assert self.split in ['train','subtrain','subval'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') - self.load_disparity = _read_middlebury20052006_disp - self.has_constant_resolution = False - - def _build_cache(self): - seqs = sorted(os.listdir(self.root)) - trainpairs = [] - for s in seqs: - for i in ['Illum1','Illum2','Illum3']: - for e in ['Exp0','Exp1','Exp2']: - trainpairs.append(osp.join(s,i,e,'view1.png')) - assert len(trainpairs)==189 - valseqs = ['Rocks1','Wood2'] - assert all(s in seqs for s in valseqs) - subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] - subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] - assert len(subtrainpairs)==171 and len(subvalpairs)==18, "incorrect parsing of pairs in Middlebury 2006" - tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - -class Md05Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Middlebury2005" - self._set_root() - assert self.split in ['train','subtrain','subval'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') - self.pairname_to_str = lambda pairname: pairname[:-4] - self.load_disparity = _read_middlebury20052006_disp - - def _build_cache(self): - seqs = sorted(os.listdir(self.root)) - trainpairs = [] - for s in seqs: - for i in ['Illum1','Illum2','Illum3']: - for e in ['Exp0','Exp1','Exp2']: - trainpairs.append(osp.join(s,i,e,'view1.png')) - assert len(trainpairs)==54, "incorrect parsing of pairs in Middlebury 2005" - valseqs = ['Reindeer'] - assert all(s in seqs for s in valseqs) - subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] - subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] - assert len(subtrainpairs)==45 and len(subvalpairs)==9, "incorrect parsing of pairs in Middlebury 2005" - tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - -class MdEval3Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "MiddleburyEval3" - self._set_root() - assert self.split in [s+'_'+r for s in ['train','subtrain','subval','test','all'] for r in ['full','half','quarter']] - if self.split.endswith('_full'): - self.root = self.root.replace('/MiddEval3','/MiddEval3_F') - elif self.split.endswith('_half'): - self.root = self.root.replace('/MiddEval3','/MiddEval3_H') - else: - assert self.split.endswith('_quarter') - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') - self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname, 'disp0GT.pfm') - self.pairname_to_str = lambda pairname: pairname - self.load_disparity = _read_middlebury_disp - # for submission only - self.submission_methodname = "CroCo-Stereo" - self.submission_sresolution = 'F' if self.split.endswith('_full') else ('H' if self.split.endswith('_half') else 'Q') - - def _build_cache(self): - trainpairs = ['train/'+s for s in sorted(os.listdir(self.root+'train/'))] - testpairs = ['test/'+s for s in sorted(os.listdir(self.root+'test/'))] - subvalpairs = trainpairs[-1:] - subtrainpairs = trainpairs[:-1] - allpairs = trainpairs+testpairs - assert len(trainpairs)==15 and len(testpairs)==15 and len(subvalpairs)==1 and len(subtrainpairs)==14 and len(allpairs)==30, "incorrect parsing of pairs in Middlebury Eval v3" - tosave = {} - for r in ['full','half','quarter']: - tosave.update(**{'train_'+r: trainpairs, 'subtrain_'+r: subtrainpairs, 'subval_'+r: subvalpairs, 'test_'+r: testpairs, 'all_'+r: allpairs}) - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==2 - assert prediction.dtype==np.float32 - outfile = os.path.join(outdir, pairname.split('/')[0].replace('train','training')+self.submission_sresolution, pairname.split('/')[1], 'disp0'+self.submission_methodname+'.pfm') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writePFM(outfile, prediction) - timefile = os.path.join( os.path.dirname(outfile), "time"+self.submission_methodname+'.txt') - with open(timefile, 'w') as fid: - fid.write(str(time)) - - def finalize_submission(self, outdir): - cmd = f'cd {outdir}/; zip -r "{self.submission_methodname}.zip" .' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/{self.submission_methodname}.zip') - -class ETH3DLowResDataset(StereoDataset): - - def _prepare_data(self): - self.name = "ETH3DLowRes" - self._set_root() - assert self.split in ['train','test','subtrain','subval','all'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') - self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: None if pairname.startswith('test/') else osp.join(self.root, pairname.replace('train/','train_gt/'), 'disp0GT.pfm') - self.pairname_to_str = lambda pairname: pairname - self.load_disparity = _read_eth3d_disp - self.has_constant_resolution = False - - def _build_cache(self): - trainpairs = ['train/' + s for s in sorted(os.listdir(self.root+'train/'))] - testpairs = ['test/' + s for s in sorted(os.listdir(self.root+'test/'))] - assert len(trainpairs) == 27 and len(testpairs) == 20, "incorrect parsing of pairs in ETH3D Low Res" - subvalpairs = ['train/delivery_area_3s','train/electro_3l','train/playground_3l'] - assert all(p in trainpairs for p in subvalpairs) - subtrainpairs = [p for p in trainpairs if not p in subvalpairs] - assert len(subvalpairs)==3 and len(subtrainpairs)==24, "incorrect parsing of pairs in ETH3D Low Res" - tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs, 'all': trainpairs+testpairs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==2 - assert prediction.dtype==np.float32 - outfile = os.path.join(outdir, 'low_res_two_view', pairname.split('/')[1]+'.pfm') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writePFM(outfile, prediction) - timefile = outfile[:-4]+'.txt' - with open(timefile, 'w') as fid: - fid.write('runtime '+str(time)) - - def finalize_submission(self, outdir): - cmd = f'cd {outdir}/; zip -r "eth3d_low_res_two_view_results.zip" low_res_two_view' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/eth3d_low_res_two_view_results.zip') - -class BoosterDataset(StereoDataset): - - def _prepare_data(self): - self.name = "Booster" - self._set_root() - assert self.split in ['train_balanced','test_balanced','subtrain_balanced','subval_balanced'] # we use only the balanced version - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/camera_00/','/camera_02/') - self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), '../disp_00.npy') # same images with different colors, same gt per sequence - self.pairname_to_str = lambda pairname: pairname[:-4].replace('/camera_00/','/') - self.load_disparity = _read_booster_disp - - - def _build_cache(self): - trainseqs = sorted(os.listdir(self.root+'train/balanced')) - trainpairs = ['train/balanced/'+s+'/camera_00/'+imname for s in trainseqs for imname in sorted(os.listdir(self.root+'train/balanced/'+s+'/camera_00/'))] - testpairs = ['test/balanced/'+s+'/camera_00/'+imname for s in sorted(os.listdir(self.root+'test/balanced')) for imname in sorted(os.listdir(self.root+'test/balanced/'+s+'/camera_00/'))] - assert len(trainpairs) == 228 and len(testpairs) == 191 - subtrainpairs = [p for p in trainpairs if any(s in p for s in trainseqs[:-2])] - subvalpairs = [p for p in trainpairs if any(s in p for s in trainseqs[-2:])] - # warning: if we do validation split, we should split scenes!!! - tosave = {'train_balanced': trainpairs, 'test_balanced': testpairs, 'subtrain_balanced': subtrainpairs, 'subval_balanced': subvalpairs,} - return tosave - -class SpringDataset(StereoDataset): - - def _prepare_data(self): - self.name = "Spring" - self._set_root() - assert self.split in ['train', 'test', 'subtrain', 'subval'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'.png').replace('frame_right','').replace('frame_left','frame_right').replace('','frame_left') - self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right') - self.pairname_to_str = lambda pairname: pairname - self.load_disparity = _read_hdf5_disp - - def _build_cache(self): - trainseqs = sorted(os.listdir( osp.join(self.root,'train'))) - trainpairs = [osp.join('train',s,'frame_left',f[:-4]) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,'frame_left')))] - testseqs = sorted(os.listdir( osp.join(self.root,'test'))) - testpairs = [osp.join('test',s,'frame_left',f[:-4]) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,'frame_left')))] - testpairs += [p.replace('frame_left','frame_right') for p in testpairs] - """maxnorm = {'0001': 32.88, '0002': 228.5, '0004': 298.2, '0005': 142.5, '0006': 113.6, '0007': 27.3, '0008': 554.5, '0009': 155.6, '0010': 126.1, '0011': 87.6, '0012': 303.2, '0013': 24.14, '0014': 82.56, '0015': 98.44, '0016': 156.9, '0017': 28.17, '0018': 21.03, '0020': 178.0, '0021': 58.06, '0022': 354.2, '0023': 8.79, '0024': 97.06, '0025': 55.16, '0026': 91.9, '0027': 156.6, '0030': 200.4, '0032': 58.66, '0033': 373.5, '0036': 149.4, '0037': 5.625, '0038': 37.0, '0039': 12.2, '0041': 453.5, '0043': 457.0, '0044': 379.5, '0045': 161.8, '0047': 105.44} # => let'use 0041""" - subtrainpairs = [p for p in trainpairs if p.split('/')[1]!='0041'] - subvalpairs = [p for p in trainpairs if p.split('/')[1]=='0041'] - assert len(trainpairs)==5000 and len(testpairs)==2000 and len(subtrainpairs)==4904 and len(subvalpairs)==96, "incorrect parsing of pairs in Spring" - tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==2 - assert prediction.dtype==np.float32 - outfile = os.path.join(outdir, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - writeDsp5File(prediction, outfile) - - def finalize_submission(self, outdir): - assert self.split=='test' - exe = "{self.root}/disp1_subsampling" - if os.path.isfile(exe): - cmd = f'cd "{outdir}/test"; {exe} .' - print(cmd) - os.system(cmd) - else: - print('Could not find disp1_subsampling executable for submission.') - print('Please download it and run:') - print(f'cd "{outdir}/test"; .') - -class Kitti12Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Kitti12" - self._set_root() - assert self.split in ['train','test'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/colored_1/')+'_10.png') - self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/disp_occ/')+'_10.png') - self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/') - self.load_disparity = _read_kitti_disp - - def _build_cache(self): - trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)] - testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)] - assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12" - tosave = {'train': trainseqs, 'test': testseqs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==2 - assert prediction.dtype==np.float32 - outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - img = (prediction * 256).astype('uint16') - Image.fromarray(img).save(outfile) - - def finalize_submission(self, outdir): - assert self.split=='test' - cmd = f'cd {outdir}/; zip -r "kitti12_results.zip" .' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/kitti12_results.zip') - -class Kitti15Dataset(StereoDataset): - - def _prepare_data(self): - self.name = "Kitti15" - self._set_root() - assert self.split in ['train','subtrain','subval','test'] - self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png') - self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/image_3/')+'_10.png') - self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/disp_occ_0/')+'_10.png') - self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') - self.load_disparity = _read_kitti_disp - - def _build_cache(self): - trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] - subtrainseqs = trainseqs[:-5] - subvalseqs = trainseqs[-5:] - testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] - assert len(trainseqs)==200 and len(subtrainseqs)==195 and len(subvalseqs)==5 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" - tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} - return tosave - - def submission_save_pairname(self, pairname, prediction, outdir, time): - assert prediction.ndim==2 - assert prediction.dtype==np.float32 - outfile = os.path.join(outdir, 'disp_0', pairname.split('/')[-1]+'_10.png') - os.makedirs( os.path.dirname(outfile), exist_ok=True) - img = (prediction * 256).astype('uint16') - Image.fromarray(img).save(outfile) - - def finalize_submission(self, outdir): - assert self.split=='test' - cmd = f'cd {outdir}/; zip -r "kitti15_results.zip" disp_0' - print(cmd) - os.system(cmd) - print(f'Done. Submission file at {outdir}/kitti15_results.zip') - - -### auxiliary functions - -def _read_img(filename): - # convert to RGB for scene flow finalpass data - img = np.asarray(Image.open(filename).convert('RGB')) - return img - -def _read_booster_disp(filename): - disp = np.load(filename) - disp[disp==0.0] = np.inf - return disp - -def _read_png_disp(filename, coef=1.0): - disp = np.asarray(Image.open(filename)) - disp = disp.astype(np.float32) / coef - disp[disp==0.0] = np.inf - return disp - -def _read_pfm_disp(filename): - disp = np.ascontiguousarray(_read_pfm(filename)[0]) - disp[disp<=0] = np.inf # eg /nfs/data/ffs-3d/datasets/middlebury/2014/Shopvac-imperfect/disp0.pfm - return disp - -def _read_npy_disp(filename): - return np.load(filename) - -def _read_crestereo_disp(filename): return _read_png_disp(filename, coef=32.0) -def _read_middlebury20052006_disp(filename): return _read_png_disp(filename, coef=1.0) -def _read_kitti_disp(filename): return _read_png_disp(filename, coef=256.0) -_read_sceneflow_disp = _read_pfm_disp -_read_eth3d_disp = _read_pfm_disp -_read_middlebury_disp = _read_pfm_disp -_read_carla_disp = _read_pfm_disp -_read_tartanair_disp = _read_npy_disp - -def _read_hdf5_disp(filename): - disp = np.asarray(h5py.File(filename)['disparity']) - disp[np.isnan(disp)] = np.inf # make invalid values as +inf - #disp[disp==0.0] = np.inf # make invalid values as +inf - return disp.astype(np.float32) - -import re -def _read_pfm(file): - file = open(file, 'rb') - - color = None - width = None - height = None - scale = None - endian = None - - header = file.readline().rstrip() - if header.decode("ascii") == 'PF': - color = True - elif header.decode("ascii") == 'Pf': - color = False - else: - raise Exception('Not a PFM file.') - - dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode("ascii")) - if dim_match: - width, height = list(map(int, dim_match.groups())) - else: - raise Exception('Malformed PFM header.') - - scale = float(file.readline().decode("ascii").rstrip()) - if scale < 0: # little-endian - endian = '<' - scale = -scale - else: - endian = '>' # big-endian - - data = np.fromfile(file, endian + 'f') - shape = (height, width, 3) if color else (height, width) - - data = np.reshape(data, shape) - data = np.flipud(data) - return data, scale - -def writePFM(file, image, scale=1): - file = open(file, 'wb') - - color = None - - if image.dtype.name != 'float32': - raise Exception('Image dtype must be float32.') - - image = np.flipud(image) - - if len(image.shape) == 3 and image.shape[2] == 3: # color image - color = True - elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1: # greyscale - color = False - else: - raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.') - - file.write('PF\n' if color else 'Pf\n'.encode()) - file.write('%d %d\n'.encode() % (image.shape[1], image.shape[0])) - - endian = image.dtype.byteorder - - if endian == '<' or endian == '=' and sys.byteorder == 'little': - scale = -scale - - file.write('%f\n'.encode() % scale) - - image.tofile(file) - -def writeDsp5File(disp, filename): - with h5py.File(filename, "w") as f: - f.create_dataset("disparity", data=disp, compression="gzip", compression_opts=5) - - -# disp visualization - -def vis_disparity(disp, m=None, M=None): - if m is None: m = disp.min() - if M is None: M = disp.max() - disp_vis = (disp - m) / (M-m) * 255.0 - disp_vis = disp_vis.astype("uint8") - disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO) - return disp_vis - -# dataset getter - -def get_train_dataset_stereo(dataset_str, augmentor=True, crop_size=None): - dataset_str = dataset_str.replace('(','Dataset(') - if augmentor: - dataset_str = dataset_str.replace(')',', augmentor=True)') - if crop_size is not None: - dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) - return eval(dataset_str) - -def get_test_datasets_stereo(dataset_str): - dataset_str = dataset_str.replace('(','Dataset(') - return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/modules/croco/stereoflow/download_model.sh b/modules/croco/stereoflow/download_model.sh deleted file mode 100644 index 533119609108c5ec3c22ff79b10e9215c1ac5098..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/download_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -model=$1 -outfile="stereoflow_models/${model}" -if [[ ! -f $outfile ]] -then - mkdir -p stereoflow_models/; - wget https://download.europe.naverlabs.com/ComputerVision/CroCo/StereoFlow_models/$1 -P stereoflow_models/; -else - echo "Model ${model} already downloaded in ${outfile}." -fi \ No newline at end of file diff --git a/modules/croco/stereoflow/engine.py b/modules/croco/stereoflow/engine.py deleted file mode 100644 index c057346b99143bf6b9c4666a58215b2b91aca7a6..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/engine.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Main function for training one epoch or testing -# -------------------------------------------------------- - -import math -import sys -from typing import Iterable -import numpy as np -import torch -import torchvision - -from utils import misc as misc - - -def split_prediction_conf(predictions, with_conf=False): - if not with_conf: - return predictions, None - conf = predictions[:,-1:,:,:] - predictions = predictions[:,:-1,:,:] - return predictions, conf - -def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, metrics: torch.nn.Module, - data_loader: Iterable, optimizer: torch.optim.Optimizer, - device: torch.device, epoch: int, loss_scaler, - log_writer=None, print_freq = 20, - args=None): - model.train(True) - metric_logger = misc.MetricLogger(delimiter=" ") - metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) - header = 'Epoch: [{}]'.format(epoch) - - accum_iter = args.accum_iter - - optimizer.zero_grad() - - details = {} - - if log_writer is not None: - print('log_dir: {}'.format(log_writer.log_dir)) - - if args.img_per_epoch: - iter_per_epoch = args.img_per_epoch // args.batch_size + int(args.img_per_epoch % args.batch_size > 0) - assert len(data_loader) >= iter_per_epoch, 'Dataset is too small for so many iterations' - len_data_loader = iter_per_epoch - else: - len_data_loader, iter_per_epoch = len(data_loader), None - - for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_logger.log_every(data_loader, print_freq, header, max_iter=iter_per_epoch)): - - image1 = image1.to(device, non_blocking=True) - image2 = image2.to(device, non_blocking=True) - gt = gt.to(device, non_blocking=True) - - # we use a per iteration (instead of per epoch) lr scheduler - if data_iter_step % accum_iter == 0: - misc.adjust_learning_rate(optimizer, data_iter_step / len_data_loader + epoch, args) - - with torch.cuda.amp.autocast(enabled=bool(args.amp)): - prediction = model(image1, image2) - prediction, conf = split_prediction_conf(prediction, criterion.with_conf) - batch_metrics = metrics(prediction.detach(), gt) - loss = criterion(prediction, gt) if conf is None else criterion(prediction, gt, conf) - - loss_value = loss.item() - if not math.isfinite(loss_value): - print("Loss is {}, stopping training".format(loss_value)) - sys.exit(1) - - loss /= accum_iter - loss_scaler(loss, optimizer, parameters=model.parameters(), - update_grad=(data_iter_step + 1) % accum_iter == 0) - if (data_iter_step + 1) % accum_iter == 0: - optimizer.zero_grad() - - torch.cuda.synchronize() - - metric_logger.update(loss=loss_value) - for k,v in batch_metrics.items(): - metric_logger.update(**{k: v.item()}) - lr = optimizer.param_groups[0]["lr"] - metric_logger.update(lr=lr) - - #if args.dsitributed: loss_value_reduce = misc.all_reduce_mean(loss_value) - time_to_log = ((data_iter_step + 1) % (args.tboard_log_step * accum_iter) == 0 or data_iter_step == len_data_loader-1) - loss_value_reduce = misc.all_reduce_mean(loss_value) - if log_writer is not None and time_to_log: - epoch_1000x = int((data_iter_step / len_data_loader + epoch) * 1000) - # We use epoch_1000x as the x-axis in tensorboard. This calibrates different curves when batch size changes. - log_writer.add_scalar('train/loss', loss_value_reduce, epoch_1000x) - log_writer.add_scalar('lr', lr, epoch_1000x) - for k,v in batch_metrics.items(): - log_writer.add_scalar('train/'+k, v.item(), epoch_1000x) - - # gather the stats from all processes - #if args.distributed: metric_logger.synchronize_between_processes() - print("Averaged stats:", metric_logger) - return {k: meter.global_avg for k, meter in metric_logger.meters.items()} - - -@torch.no_grad() -def validate_one_epoch(model: torch.nn.Module, - criterion: torch.nn.Module, - metrics: torch.nn.Module, - data_loaders: list[Iterable], - device: torch.device, - epoch: int, - log_writer=None, - args=None): - - model.eval() - metric_loggers = [] - header = 'Epoch: [{}]'.format(epoch) - print_freq = 20 - - conf_mode = args.tile_conf_mode - crop = args.crop - - if log_writer is not None: - print('log_dir: {}'.format(log_writer.log_dir)) - - results = {} - dnames = [] - image1, image2, gt, prediction = None, None, None, None - for didx, data_loader in enumerate(data_loaders): - dname = str(data_loader.dataset) - dnames.append(dname) - metric_loggers.append(misc.MetricLogger(delimiter=" ")) - for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_loggers[didx].log_every(data_loader, print_freq, header)): - image1 = image1.to(device, non_blocking=True) - image2 = image2.to(device, non_blocking=True) - gt = gt.to(device, non_blocking=True) - if dname.startswith('Spring'): - assert gt.size(2)==image1.size(2)*2 and gt.size(3)==image1.size(3)*2 - gt = (gt[:,:,0::2,0::2] + gt[:,:,0::2,1::2] + gt[:,:,1::2,0::2] + gt[:,:,1::2,1::2] ) / 4.0 # we approximate the gt based on the 2x upsampled ones - - with torch.inference_mode(): - prediction, tiled_loss, c = tiled_pred(model, criterion, image1, image2, gt, conf_mode=conf_mode, overlap=args.val_overlap, crop=crop, with_conf=criterion.with_conf) - batch_metrics = metrics(prediction.detach(), gt) - loss = criterion(prediction.detach(), gt) if not criterion.with_conf else criterion(prediction.detach(), gt, c) - loss_value = loss.item() - metric_loggers[didx].update(loss_tiled=tiled_loss.item()) - metric_loggers[didx].update(**{f'loss': loss_value}) - for k,v in batch_metrics.items(): - metric_loggers[didx].update(**{dname+'_' + k: v.item()}) - - results = {k: meter.global_avg for ml in metric_loggers for k, meter in ml.meters.items()} - if len(dnames)>1: - for k in batch_metrics.keys(): - results['AVG_'+k] = sum(results[dname+'_'+k] for dname in dnames) / len(dnames) - - if log_writer is not None : - epoch_1000x = int((1 + epoch) * 1000) - for k,v in results.items(): - log_writer.add_scalar('val/'+k, v, epoch_1000x) - - print("Averaged stats:", results) - return results - -import torch.nn.functional as F -def _resize_img(img, new_size): - return F.interpolate(img, size=new_size, mode='bicubic', align_corners=False) -def _resize_stereo_or_flow(data, new_size): - assert data.ndim==4 - assert data.size(1) in [1,2] - scale_x = new_size[1]/float(data.size(3)) - out = F.interpolate(data, size=new_size, mode='bicubic', align_corners=False) - out[:,0,:,:] *= scale_x - if out.size(1)==2: - scale_y = new_size[0]/float(data.size(2)) - out[:,1,:,:] *= scale_y - print(scale_x, new_size, data.shape) - return out - - -@torch.no_grad() -def tiled_pred(model, criterion, img1, img2, gt, - overlap=0.5, bad_crop_thr=0.05, - downscale=False, crop=512, ret='loss', - conf_mode='conf_expsigmoid_10_5', with_conf=False, - return_time=False): - - # for each image, we are going to run inference on many overlapping patches - # then, all predictions will be weighted-averaged - if gt is not None: - B, C, H, W = gt.shape - else: - B, _, H, W = img1.shape - C = model.head.num_channels-int(with_conf) - win_height, win_width = crop[0], crop[1] - - # upscale to be larger than the crop - do_change_scale = H= window and 0 <= overlap < 1, (total, window, overlap) - num_windows = 1 + int(np.ceil( (total - window) / ((1-overlap) * window) )) - offsets = np.linspace(0, total-window, num_windows).round().astype(int) - yield from (slice(x, x+window) for x in offsets) - -def _crop(img, sy, sx): - B, THREE, H, W = img.shape - if 0 <= sy.start and sy.stop <= H and 0 <= sx.start and sx.stop <= W: - return img[:,:,sy,sx] - l, r = max(0,-sx.start), max(0,sx.stop-W) - t, b = max(0,-sy.start), max(0,sy.stop-H) - img = torch.nn.functional.pad(img, (l,r,t,b), mode='constant') - return img[:, :, slice(sy.start+t,sy.stop+t), slice(sx.start+l,sx.stop+l)] \ No newline at end of file diff --git a/modules/croco/stereoflow/test.py b/modules/croco/stereoflow/test.py deleted file mode 100644 index 0248e56664c769752595af251e1eadcfa3a479d9..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/test.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Main test function -# -------------------------------------------------------- - -import os -import argparse -import pickle -from PIL import Image -import numpy as np -from tqdm import tqdm - -import torch -from torch.utils.data import DataLoader - -import utils.misc as misc -from models.croco_downstream import CroCoDownstreamBinocular -from models.head_downstream import PixelwiseTaskWithDPT - -from stereoflow.criterion import * -from stereoflow.datasets_stereo import get_test_datasets_stereo -from stereoflow.datasets_flow import get_test_datasets_flow -from stereoflow.engine import tiled_pred - -from stereoflow.datasets_stereo import vis_disparity -from stereoflow.datasets_flow import flowToColor - -def get_args_parser(): - parser = argparse.ArgumentParser('Test CroCo models on stereo/flow', add_help=False) - # important argument - parser.add_argument('--model', required=True, type=str, help='Path to the model to evaluate') - parser.add_argument('--dataset', required=True, type=str, help="test dataset (there can be multiple dataset separated by a +)") - # tiling - parser.add_argument('--tile_conf_mode', type=str, default='', help='Weights for the tiling aggregation based on confidence (empty means use the formula from the loaded checkpoint') - parser.add_argument('--tile_overlap', type=float, default=0.7, help='overlap between tiles') - # save (it will automatically go to _/_) - parser.add_argument('--save', type=str, nargs='+', default=[], - help='what to save: \ - metrics (pickle file), \ - pred (raw prediction save as torch tensor), \ - visu (visualization in png of each prediction), \ - err10 (visualization in png of the error clamp at 10 for each prediction), \ - submission (submission file)') - # other (no impact) - parser.add_argument('--num_workers', default=4, type=int) - return parser - - -def _load_model_and_criterion(model_path, do_load_metrics, device): - print('loading model from', model_path) - assert os.path.isfile(model_path) - ckpt = torch.load(model_path, 'cpu') - - ckpt_args = ckpt['args'] - task = ckpt_args.task - tile_conf_mode = ckpt_args.tile_conf_mode - num_channels = {'stereo': 1, 'flow': 2}[task] - with_conf = eval(ckpt_args.criterion).with_conf - if with_conf: num_channels += 1 - print('head: PixelwiseTaskWithDPT()') - head = PixelwiseTaskWithDPT() - head.num_channels = num_channels - print('croco_args:', ckpt_args.croco_args) - model = CroCoDownstreamBinocular(head, **ckpt_args.croco_args) - msg = model.load_state_dict(ckpt['model'], strict=True) - model.eval() - model = model.to(device) - - if do_load_metrics: - if task=='stereo': - metrics = StereoDatasetMetrics().to(device) - else: - metrics = FlowDatasetMetrics().to(device) - else: - metrics = None - - return model, metrics, ckpt_args.crop, with_conf, task, tile_conf_mode - - -def _save_batch(pred, gt, pairnames, dataset, task, save, outdir, time, submission_dir=None): - - for i in range(len(pairnames)): - - pairname = eval(pairnames[i]) if pairnames[i].startswith('(') else pairnames[i] # unbatch pairname - fname = os.path.join(outdir, dataset.pairname_to_str(pairname)) - os.makedirs(os.path.dirname(fname), exist_ok=True) - - predi = pred[i,...] - if gt is not None: gti = gt[i,...] - - if 'pred' in save: - torch.save(predi.squeeze(0).cpu(), fname+'_pred.pth') - - if 'visu' in save: - if task=='stereo': - disparity = predi.permute((1,2,0)).squeeze(2).cpu().numpy() - m,M = None - if gt is not None: - mask = torch.isfinite(gti) - m = gt[mask].min() - M = gt[mask].max() - img_disparity = vis_disparity(disparity, m=m, M=M) - Image.fromarray(img_disparity).save(fname+'_pred.png') - else: - # normalize flowToColor according to the maxnorm of gt (or prediction if not available) - flowNorm = torch.sqrt(torch.sum( (gti if gt is not None else predi)**2, dim=0)).max().item() - imgflow = flowToColor(predi.permute((1,2,0)).cpu().numpy(), maxflow=flowNorm) - Image.fromarray(imgflow).save(fname+'_pred.png') - - if 'err10' in save: - assert gt is not None - L2err = torch.sqrt(torch.sum( (gti-predi)**2, dim=0)) - valid = torch.isfinite(gti[0,:,:]) - L2err[~valid] = 0.0 - L2err = torch.clamp(L2err, max=10.0) - red = (L2err*255.0/10.0).to(dtype=torch.uint8)[:,:,None] - zer = torch.zeros_like(red) - imgerr = torch.cat( (red,zer,zer), dim=2).cpu().numpy() - Image.fromarray(imgerr).save(fname+'_err10.png') - - if 'submission' in save: - assert submission_dir is not None - predi_np = predi.permute(1,2,0).squeeze(2).cpu().numpy() # transform into HxWx2 for flow or HxW for stereo - dataset.submission_save_pairname(pairname, predi_np, submission_dir, time) - -def main(args): - - # load the pretrained model and metrics - device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') - model, metrics, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion(args.model, 'metrics' in args.save, device) - if args.tile_conf_mode=='': args.tile_conf_mode = tile_conf_mode - - # load the datasets - datasets = (get_test_datasets_stereo if task=='stereo' else get_test_datasets_flow)(args.dataset) - dataloaders = [DataLoader(dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for dataset in datasets] - - # run - for i,dataloader in enumerate(dataloaders): - dataset = datasets[i] - dstr = args.dataset.split('+')[i] - - outdir = args.model+'_'+misc.filename(dstr) - if 'metrics' in args.save and len(args.save)==1: - fname = os.path.join(outdir, f'conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}.pkl') - if os.path.isfile(fname) and len(args.save)==1: - print(' metrics already compute in '+fname) - with open(fname, 'rb') as fid: - results = pickle.load(fid) - for k,v in results.items(): - print('{:s}: {:.3f}'.format(k, v)) - continue - - if 'submission' in args.save: - dirname = f'submission_conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}' - submission_dir = os.path.join(outdir, dirname) - else: - submission_dir = None - - print('') - print('saving {:s} in {:s}'.format('+'.join(args.save), outdir)) - print(repr(dataset)) - - if metrics is not None: - metrics.reset() - - for data_iter_step, (image1, image2, gt, pairnames) in enumerate(tqdm(dataloader)): - - do_flip = (task=='stereo' and dstr.startswith('Spring') and any("right" in p for p in pairnames)) # we flip the images and will flip the prediction after as we assume img1 is on the left - - image1 = image1.to(device, non_blocking=True) - image2 = image2.to(device, non_blocking=True) - gt = gt.to(device, non_blocking=True) if gt.numel()>0 else None # special case for test time - if do_flip: - assert all("right" in p for p in pairnames) - image1 = image1.flip(dims=[3]) # this is already the right frame, let's flip it - image2 = image2.flip(dims=[3]) - gt = gt # that is ok - - with torch.inference_mode(): - pred, _, _, time = tiled_pred(model, None, image1, image2, None if dataset.name=='Spring' else gt, conf_mode=args.tile_conf_mode, overlap=args.tile_overlap, crop=cropsize, with_conf=with_conf, return_time=True) - - if do_flip: - pred = pred.flip(dims=[3]) - - if metrics is not None: - metrics.add_batch(pred, gt) - - if any(k in args.save for k in ['pred','visu','err10','submission']): - _save_batch(pred, gt, pairnames, dataset, task, args.save, outdir, time, submission_dir=submission_dir) - - - # print - if metrics is not None: - results = metrics.get_results() - for k,v in results.items(): - print('{:s}: {:.3f}'.format(k, v)) - - # save if needed - if 'metrics' in args.save: - os.makedirs(os.path.dirname(fname), exist_ok=True) - with open(fname, 'wb') as fid: - pickle.dump(results, fid) - print('metrics saved in', fname) - - # finalize submission if needed - if 'submission' in args.save: - dataset.finalize_submission(submission_dir) - - - -if __name__ == '__main__': - args = get_args_parser() - args = args.parse_args() - main(args) \ No newline at end of file diff --git a/modules/croco/stereoflow/train.py b/modules/croco/stereoflow/train.py deleted file mode 100644 index 91f2414ffbe5ecd547d31c0e2455478d402719d6..0000000000000000000000000000000000000000 --- a/modules/croco/stereoflow/train.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -# -------------------------------------------------------- -# Main training function -# -------------------------------------------------------- - -import argparse -import datetime -import json -import numpy as np -import os -import sys -import time - -import torch -import torch.distributed as dist -import torch.backends.cudnn as cudnn -from torch.utils.tensorboard import SummaryWriter -import torchvision.transforms as transforms -import torchvision.datasets as datasets -from torch.utils.data import DataLoader - -import utils -import utils.misc as misc -from utils.misc import NativeScalerWithGradNormCount as NativeScaler -from models.croco_downstream import CroCoDownstreamBinocular, croco_args_from_ckpt -from models.pos_embed import interpolate_pos_embed -from models.head_downstream import PixelwiseTaskWithDPT - -from stereoflow.datasets_stereo import get_train_dataset_stereo, get_test_datasets_stereo -from stereoflow.datasets_flow import get_train_dataset_flow, get_test_datasets_flow -from stereoflow.engine import train_one_epoch, validate_one_epoch -from stereoflow.criterion import * - - -def get_args_parser(): - # prepare subparsers - parser = argparse.ArgumentParser('Finetuning CroCo models on stereo or flow', add_help=False) - subparsers = parser.add_subparsers(title="Task (stereo or flow)", dest="task", required=True) - parser_stereo = subparsers.add_parser('stereo', help='Training stereo model') - parser_flow = subparsers.add_parser('flow', help='Training flow model') - def add_arg(name_or_flags, default=None, default_stereo=None, default_flow=None, **kwargs): - if default is not None: assert default_stereo is None and default_flow is None, "setting default makes default_stereo and default_flow disabled" - parser_stereo.add_argument(name_or_flags, default=default if default is not None else default_stereo, **kwargs) - parser_flow.add_argument(name_or_flags, default=default if default is not None else default_flow, **kwargs) - # output dir - add_arg('--output_dir', required=True, type=str, help='path where to save, if empty, automatically created') - # model - add_arg('--crop', type=int, nargs = '+', default_stereo=[352, 704], default_flow=[320, 384], help = "size of the random image crops used during training.") - add_arg('--pretrained', required=True, type=str, help="Load pretrained model (required as croco arguments come from there)") - # criterion - add_arg('--criterion', default_stereo='LaplacianLossBounded2()', default_flow='LaplacianLossBounded()', type=str, help='string to evaluate to get criterion') - add_arg('--bestmetric', default_stereo='avgerr', default_flow='EPE', type=str) - # dataset - add_arg('--dataset', type=str, required=True, help="training set") - # training - add_arg('--seed', default=0, type=int, help='seed') - add_arg('--batch_size', default_stereo=6, default_flow=8, type=int, help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') - add_arg('--epochs', default=32, type=int, help='number of training epochs') - add_arg('--img_per_epoch', type=int, default=None, help='Fix the number of images seen in an epoch (None means use all training pairs)') - add_arg('--accum_iter', default=1, type=int, help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') - add_arg('--weight_decay', type=float, default=0.05, help='weight decay (default: 0.05)') - add_arg('--lr', type=float, default_stereo=3e-5, default_flow=2e-5, metavar='LR', help='learning rate (absolute lr)') - add_arg('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') - add_arg('--warmup_epochs', type=int, default=1, metavar='N', help='epochs to warmup LR') - add_arg('--optimizer', default='AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))', type=str, - help="Optimizer from torch.optim [ default: AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) ]") - add_arg('--amp', default=0, type=int, choices=[0,1], help='enable automatic mixed precision training') - # validation - add_arg('--val_dataset', type=str, default='', help="Validation sets, multiple separated by + (empty string means that no validation is performed)") - add_arg('--tile_conf_mode', type=str, default_stereo='conf_expsigmoid_15_3', default_flow='conf_expsigmoid_10_5', help='Weights for tile aggregation') - add_arg('--val_overlap', default=0.7, type=float, help='Overlap value for the tiling') - # others - add_arg('--num_workers', default=8, type=int) - add_arg('--eval_every', type=int, default=1, help='Val loss evaluation frequency') - add_arg('--save_every', type=int, default=1, help='Save checkpoint frequency') - add_arg('--start_from', type=str, default=None, help='Start training using weights from an other model (eg for finetuning)') - add_arg('--tboard_log_step', type=int, default=100, help='Log to tboard every so many steps') - add_arg('--dist_url', default='env://', help='url used to set up distributed training') - - return parser - - -def main(args): - misc.init_distributed_mode(args) - global_rank = misc.get_rank() - num_tasks = misc.get_world_size() - - assert os.path.isfile(args.pretrained) - print("output_dir: "+args.output_dir) - os.makedirs(args.output_dir, exist_ok=True) - - # fix the seed for reproducibility - seed = args.seed + misc.get_rank() - torch.manual_seed(seed) - np.random.seed(seed) - cudnn.benchmark = True - - # Metrics / criterion - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - metrics = (StereoMetrics if args.task=='stereo' else FlowMetrics)().to(device) - criterion = eval(args.criterion).to(device) - print('Criterion: ', args.criterion) - - # Prepare model - assert os.path.isfile(args.pretrained) - ckpt = torch.load(args.pretrained, 'cpu') - croco_args = croco_args_from_ckpt(ckpt) - croco_args['img_size'] = (args.crop[0], args.crop[1]) - print('Croco args: '+str(croco_args)) - args.croco_args = croco_args # saved for test time - # prepare head - num_channels = {'stereo': 1, 'flow': 2}[args.task] - if criterion.with_conf: num_channels += 1 - print(f'Building head PixelwiseTaskWithDPT() with {num_channels} channel(s)') - head = PixelwiseTaskWithDPT() - head.num_channels = num_channels - # build model and load pretrained weights - model = CroCoDownstreamBinocular(head, **croco_args) - interpolate_pos_embed(model, ckpt['model']) - msg = model.load_state_dict(ckpt['model'], strict=False) - print(msg) - - total_params = sum(p.numel() for p in model.parameters()) - total_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"Total params: {total_params}") - print(f"Total params trainable: {total_params_trainable}") - model_without_ddp = model.to(device) - - eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() - print("lr: %.2e" % args.lr) - print("accumulate grad iterations: %d" % args.accum_iter) - print("effective batch size: %d" % eff_batch_size) - - if args.distributed: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], static_graph=True) - model_without_ddp = model.module - - # following timm: set wd as 0 for bias and norm layers - param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) - optimizer = eval(f"torch.optim.{args.optimizer}") - print(optimizer) - loss_scaler = NativeScaler() - - # automatic restart - last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') - args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None - - if not args.resume and args.start_from: - print(f"Starting from an other model's weights: {args.start_from}") - best_so_far = None - args.start_epoch = 0 - ckpt = torch.load(args.start_from, 'cpu') - msg = model_without_ddp.load_state_dict(ckpt['model'], strict=False) - print(msg) - else: - best_so_far = misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) - - if best_so_far is None: best_so_far = np.inf - - # tensorboard - log_writer = None - if global_rank == 0 and args.output_dir is not None: - log_writer = SummaryWriter(log_dir=args.output_dir, purge_step=args.start_epoch*1000) - - # dataset and loader - print('Building Train Data loader for dataset: ', args.dataset) - train_dataset = (get_train_dataset_stereo if args.task=='stereo' else get_train_dataset_flow)(args.dataset, crop_size=args.crop) - def _print_repr_dataset(d): - if isinstance(d, torch.utils.data.dataset.ConcatDataset): - for dd in d.datasets: - _print_repr_dataset(dd) - else: - print(repr(d)) - _print_repr_dataset(train_dataset) - print(' total length:', len(train_dataset)) - if args.distributed: - sampler_train = torch.utils.data.DistributedSampler( - train_dataset, num_replicas=num_tasks, rank=global_rank, shuffle=True - ) - else: - sampler_train = torch.utils.data.RandomSampler(train_dataset) - data_loader_train = torch.utils.data.DataLoader( - train_dataset, sampler=sampler_train, - batch_size=args.batch_size, - num_workers=args.num_workers, - pin_memory=True, - drop_last=True, - ) - if args.val_dataset=='': - data_loaders_val = None - else: - print('Building Val Data loader for datasets: ', args.val_dataset) - val_datasets = (get_test_datasets_stereo if args.task=='stereo' else get_test_datasets_flow)(args.val_dataset) - for val_dataset in val_datasets: print(repr(val_dataset)) - data_loaders_val = [DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for val_dataset in val_datasets] - bestmetric = ("AVG_" if len(data_loaders_val)>1 else str(data_loaders_val[0].dataset)+'_')+args.bestmetric - - print(f"Start training for {args.epochs} epochs") - start_time = time.time() - # Training Loop - for epoch in range(args.start_epoch, args.epochs): - - if args.distributed: data_loader_train.sampler.set_epoch(epoch) - - # Train - epoch_start = time.time() - train_stats = train_one_epoch(model, criterion, metrics, data_loader_train, optimizer, device, epoch, loss_scaler, log_writer=log_writer, args=args) - epoch_time = time.time() - epoch_start - - if args.distributed: dist.barrier() - - # Validation (current naive implementation runs the validation on every gpu ... not smart ...) - if data_loaders_val is not None and args.eval_every > 0 and (epoch+1) % args.eval_every == 0: - val_epoch_start = time.time() - val_stats = validate_one_epoch(model, criterion, metrics, data_loaders_val, device, epoch, log_writer=log_writer, args=args) - val_epoch_time = time.time() - val_epoch_start - - val_best = val_stats[bestmetric] - - # Save best of all - if val_best <= best_so_far: - best_so_far = val_best - misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='best') - - log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, - 'epoch': epoch, - **{f'val_{k}': v for k, v in val_stats.items()}} - else: - log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, - 'epoch': epoch,} - - if args.distributed: dist.barrier() - - # Save stuff - if args.output_dir and ((epoch+1) % args.save_every == 0 or epoch + 1 == args.epochs): - misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='last') - - if args.output_dir: - if log_writer is not None: - log_writer.flush() - with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: - f.write(json.dumps(log_stats) + "\n") - - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('Training time {}'.format(total_time_str)) - -if __name__ == '__main__': - args = get_args_parser() - args = args.parse_args() - main(args) \ No newline at end of file diff --git a/modules/croco/utils/misc.py b/modules/croco/utils/misc.py deleted file mode 100644 index 132e102a662c987dce5282633cb8730b0e0d5c2d..0000000000000000000000000000000000000000 --- a/modules/croco/utils/misc.py +++ /dev/null @@ -1,463 +0,0 @@ -# Copyright (C) 2022-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# utilitary functions for CroCo -# -------------------------------------------------------- -# References: -# MAE: https://github.com/facebookresearch/mae -# DeiT: https://github.com/facebookresearch/deit -# BEiT: https://github.com/microsoft/unilm/tree/master/beit -# -------------------------------------------------------- - -import builtins -import datetime -import os -import time -import math -import json -from collections import defaultdict, deque -from pathlib import Path -import numpy as np - -import torch -import torch.distributed as dist -from torch import inf - -class SmoothedValue(object): - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! - """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, - avg=self.avg, - global_avg=self.global_avg, - max=self.max, - value=self.value) - - -class MetricLogger(object): - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if v is None: - continue - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append( - "{}: {}".format(name, str(meter)) - ) - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None, max_iter=None): - i = 0 - if not header: - header = '' - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt='{avg:.4f}') - data_time = SmoothedValue(fmt='{avg:.4f}') - len_iterable = min(len(iterable), max_iter) if max_iter else len(iterable) - space_fmt = ':' + str(len(str(len_iterable))) + 'd' - log_msg = [ - header, - '[{0' + space_fmt + '}/{1}]', - 'eta: {eta}', - '{meters}', - 'time: {time}', - 'data: {data}' - ] - if torch.cuda.is_available(): - log_msg.append('max mem: {memory:.0f}') - log_msg = self.delimiter.join(log_msg) - MB = 1024.0 * 1024.0 - for it,obj in enumerate(iterable): - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len_iterable - 1: - eta_seconds = iter_time.global_avg * (len_iterable - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print(log_msg.format( - i, len_iterable, eta=eta_string, - meters=str(self), - time=str(iter_time), data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB)) - else: - print(log_msg.format( - i, len_iterable, eta=eta_string, - meters=str(self), - time=str(iter_time), data=str(data_time))) - i += 1 - end = time.time() - if max_iter and it >= max_iter: - break - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('{} Total time: {} ({:.4f} s / it)'.format( - header, total_time_str, total_time / len_iterable)) - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - builtin_print = builtins.print - - def print(*args, **kwargs): - force = kwargs.pop('force', False) - force = force or (get_world_size() > 8) - if is_master or force: - now = datetime.datetime.now().time() - builtin_print('[{}] '.format(now), end='') # print with time stamp - builtin_print(*args, **kwargs) - - builtins.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - nodist = args.nodist if hasattr(args,'nodist') else False - if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ and not nodist: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ['WORLD_SIZE']) - args.gpu = int(os.environ['LOCAL_RANK']) - else: - print('Not using distributed mode') - setup_for_distributed(is_master=True) # hack - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = 'nccl' - print('| distributed init (rank {}): {}, gpu {}'.format( - args.rank, args.dist_url, args.gpu), flush=True) - torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -class NativeScalerWithGradNormCount: - state_dict_key = "amp_scaler" - - def __init__(self, enabled=True): - self._scaler = torch.cuda.amp.GradScaler(enabled=enabled) - - def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): - self._scaler.scale(loss).backward(create_graph=create_graph) - if update_grad: - if clip_grad is not None: - assert parameters is not None - self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place - norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) - else: - self._scaler.unscale_(optimizer) - norm = get_grad_norm_(parameters) - self._scaler.step(optimizer) - self._scaler.update() - else: - norm = None - return norm - - def state_dict(self): - return self._scaler.state_dict() - - def load_state_dict(self, state_dict): - self._scaler.load_state_dict(state_dict) - - -def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor: - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = [p for p in parameters if p.grad is not None] - norm_type = float(norm_type) - if len(parameters) == 0: - return torch.tensor(0.) - device = parameters[0].grad.device - if norm_type == inf: - total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) - else: - total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) - return total_norm - - - - -def save_model(args, epoch, model_without_ddp, optimizer, loss_scaler, fname=None, best_so_far=None): - output_dir = Path(args.output_dir) - if fname is None: fname = str(epoch) - checkpoint_path = output_dir / ('checkpoint-%s.pth' % fname) - to_save = { - 'model': model_without_ddp.state_dict(), - 'optimizer': optimizer.state_dict(), - 'scaler': loss_scaler.state_dict(), - 'args': args, - 'epoch': epoch, - } - if best_so_far is not None: to_save['best_so_far'] = best_so_far - print(f'>> Saving model to {checkpoint_path} ...') - save_on_master(to_save, checkpoint_path) - - -def load_model(args, model_without_ddp, optimizer, loss_scaler): - args.start_epoch = 0 - best_so_far = None - if args.resume is not None: - if args.resume.startswith('https'): - checkpoint = torch.hub.load_state_dict_from_url( - args.resume, map_location='cpu', check_hash=True) - else: - checkpoint = torch.load(args.resume, map_location='cpu') - print("Resume checkpoint %s" % args.resume) - model_without_ddp.load_state_dict(checkpoint['model'], strict=False) - args.start_epoch = checkpoint['epoch'] + 1 - optimizer.load_state_dict(checkpoint['optimizer']) - if 'scaler' in checkpoint: - loss_scaler.load_state_dict(checkpoint['scaler']) - if 'best_so_far' in checkpoint: - best_so_far = checkpoint['best_so_far'] - print(" & best_so_far={:g}".format(best_so_far)) - else: - print("") - print("With optim & sched! start_epoch={:d}".format(args.start_epoch), end='') - return best_so_far - -def all_reduce_mean(x): - world_size = get_world_size() - if world_size > 1: - x_reduce = torch.tensor(x).cuda() - dist.all_reduce(x_reduce) - x_reduce /= world_size - return x_reduce.item() - else: - return x - -def _replace(text, src, tgt, rm=''): - """ Advanced string replacement. - Given a text: - - replace all elements in src by the corresponding element in tgt - - remove all elements in rm - """ - if len(tgt) == 1: - tgt = tgt * len(src) - assert len(src) == len(tgt), f"'{src}' and '{tgt}' should have the same len" - for s,t in zip(src, tgt): - text = text.replace(s,t) - for c in rm: - text = text.replace(c,'') - return text - -def filename( obj ): - """ transform a python obj or cmd into a proper filename. - - \1 gets replaced by slash '/' - - \2 gets replaced by comma ',' - """ - if not isinstance(obj, str): - obj = repr(obj) - obj = str(obj).replace('()','') - obj = _replace(obj, '_,(*/\1\2','-__x%/,', rm=' )\'"') - assert all(len(s) < 256 for s in obj.split(os.sep)), 'filename too long (>256 characters):\n'+obj - return obj - -def _get_num_layer_for_vit(var_name, enc_depth, dec_depth): - if var_name in ("cls_token", "mask_token", "pos_embed", "global_tokens"): - return 0 - elif var_name.startswith("patch_embed"): - return 0 - elif var_name.startswith("enc_blocks"): - layer_id = int(var_name.split('.')[1]) - return layer_id + 1 - elif var_name.startswith('decoder_embed') or var_name.startswith('enc_norm'): # part of the last black - return enc_depth - elif var_name.startswith('dec_blocks'): - layer_id = int(var_name.split('.')[1]) - return enc_depth + layer_id + 1 - elif var_name.startswith('dec_norm'): # part of the last block - return enc_depth + dec_depth - elif any(var_name.startswith(k) for k in ['head','prediction_head']): - return enc_depth + dec_depth + 1 - else: - raise NotImplementedError(var_name) - -def get_parameter_groups(model, weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=[]): - parameter_group_names = {} - parameter_group_vars = {} - enc_depth, dec_depth = None, None - # prepare layer decay values - assert layer_decay==1.0 or 0. 9 - l2 = net.dec_depth - feature_dim = 256 - last_dim = feature_dim // 2 - out_nchan = 3 - ed = net.enc_embed_dim - dd = net.dec_embed_dim - return Cat_MLP_LocalFeatures_DPT_Pts3d(net, local_feat_dim=local_feat_dim, has_conf=has_conf, - num_channels=out_nchan + has_conf, - feature_dim=feature_dim, - last_dim=last_dim, - hooks_idx=[0, l2 * 2 // 4, l2 * 3 // 4, l2], - dim_tokens=[ed, dd, dd, dd], - postprocess=postprocess, - depth_mode=net.depth_mode, - conf_mode=net.conf_mode, - head_type='regression') - else: - raise NotImplementedError( - f"unexpected {head_type=} and {output_mode=}") diff --git a/modules/mast3r/cloud_opt/__init__.py b/modules/mast3r/cloud_opt/__init__.py deleted file mode 100644 index d7dd877d649ce4dbd749dd7195a8b34c0f91d4f0..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). \ No newline at end of file diff --git a/modules/mast3r/cloud_opt/__pycache__/__init__.cpython-312.pyc b/modules/mast3r/cloud_opt/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 147d3ccea6436a3d0c599dd11741e48369c2c841..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/__pycache__/sparse_ga.cpython-312.pyc b/modules/mast3r/cloud_opt/__pycache__/sparse_ga.cpython-312.pyc deleted file mode 100644 index 10e57fc8d69ece65aa17249b62b0978cb88d69ae..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/__pycache__/sparse_ga.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/__pycache__/tsdf_optimizer.cpython-312.pyc b/modules/mast3r/cloud_opt/__pycache__/tsdf_optimizer.cpython-312.pyc deleted file mode 100644 index 54a61a60cc5ba7856dcaa896f26bf6d3812baefb..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/__pycache__/tsdf_optimizer.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/sparse_ga.py b/modules/mast3r/cloud_opt/sparse_ga.py deleted file mode 100644 index eb1eb6b4d264e458d4efdc4e50281f1d0c7c4012..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/sparse_ga.py +++ /dev/null @@ -1,1040 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# MASt3R Sparse Global Alignement -# -------------------------------------------------------- -from tqdm import tqdm -import roma -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -import os -from collections import namedtuple -from functools import lru_cache -from scipy import sparse as sp -import copy - -from mast3r.utils.misc import mkdir_for, hash_md5 -from mast3r.cloud_opt.utils.losses import gamma_loss -from mast3r.cloud_opt.utils.schedules import linear_schedule, cosine_schedule -from mast3r.fast_nn import fast_reciprocal_NNs, merge_corres - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.utils.geometry import inv, geotrf # noqa -from dust3r.utils.device import to_cpu, to_numpy, todevice # noqa -from dust3r.post_process import estimate_focal_knowing_depth # noqa -from dust3r.optim_factory import adjust_learning_rate_by_lr # noqa -from dust3r.cloud_opt.base_opt import clean_pointcloud -from dust3r.viz import SceneViz - - -class SparseGA(): - def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None): - def fetch_img(im): - def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.) - for im1, im2 in pairs_in: - if im1['instance'] == im: - return torgb(im1['img']) - if im2['instance'] == im: - return torgb(im2['img']) - self.canonical_paths = canonical_paths - self.img_paths = img_paths - self.imgs = [fetch_img(img) for img in img_paths] - self.intrinsics = res_fine['intrinsics'] - self.cam2w = res_fine['cam2w'] - self.depthmaps = res_fine['depthmaps'] - self.pts3d = res_fine['pts3d'] - self.pts3d_colors = [] - self.working_device = self.cam2w.device - for i in range(len(self.imgs)): - im = self.imgs[i] - x, y = anchors[i][0][..., :2].detach().cpu().numpy().T - self.pts3d_colors.append(im[y, x]) - assert self.pts3d_colors[-1].shape == self.pts3d[i].shape - self.n_imgs = len(self.imgs) - - def get_focals(self): - return torch.tensor([ff[0, 0] for ff in self.intrinsics]).to(self.working_device) - - def get_principal_points(self): - return torch.stack([ff[:2, -1] for ff in self.intrinsics]).to(self.working_device) - - def get_im_poses(self): - return self.cam2w - - def get_sparse_pts3d(self): - return self.pts3d - - def get_dense_pts3d(self, clean_depth=True, subsample=8): - assert self.canonical_paths, 'cache_path is required for dense 3d points' - device = self.cam2w.device - confs = [] - base_focals = [] - anchors = {} - for i, canon_path in enumerate(self.canonical_paths): - (canon, canon2, conf), focal = torch.load(canon_path, map_location=device) - confs.append(conf) - base_focals.append(focal) - - H, W = conf.shape - pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device) - idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample) - anchors[i] = (pixels, idxs[i], offsets[i]) - - # densify sparse depthmaps - pts3d, depthmaps = make_pts3d(anchors, self.intrinsics, self.cam2w, [ - d.ravel() for d in self.depthmaps], base_focals=base_focals, ret_depth=True) - - if clean_depth: - confs = clean_pointcloud(confs, self.intrinsics, inv(self.cam2w), depthmaps, pts3d) - - return pts3d, depthmaps, confs - - def get_pts3d_colors(self): - return self.pts3d_colors - - def get_depthmaps(self): - return self.depthmaps - - def get_masks(self): - return [slice(None, None) for _ in range(len(self.imgs))] - - def show(self, show_cams=True): - pts3d, _, confs = self.get_dense_pts3d() - show_reconstruction(self.imgs, self.intrinsics if show_cams else None, self.cam2w, - [p.clip(min=-50, max=50) for p in pts3d], - masks=[c > 1 for c in confs]) - - -def convert_dust3r_pairs_naming(imgs, pairs_in): - for pair_id in range(len(pairs_in)): - for i in range(2): - pairs_in[pair_id][i]['instance'] = imgs[pairs_in[pair_id][i]['idx']] - return pairs_in - - -def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf', - device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw): - """ Sparse alignment with MASt3R - imgs: list of image paths - cache_path: path where to dump temporary files (str) - - lr1, niter1: learning rate and #iterations for coarse global alignment (3D matching) - lr2, niter2: learning rate and #iterations for refinement (2D reproj error) - - lora_depth: smart dimensionality reduction with depthmaps - """ - # Convert pair naming convention from dust3r to mast3r - pairs_in = convert_dust3r_pairs_naming(imgs, pairs_in) - # forward pass - pairs, cache_path = forward_mast3r(pairs_in, model, - cache_path=cache_path, subsample=subsample, - desc_conf=desc_conf, device=device) - - # extract canonical pointmaps - tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \ - prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device) - - # compute minimal spanning tree - mst = compute_min_spanning_tree(pairwise_scores) - - # remove all edges not in the spanning tree? - # min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]} - # tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree} - - # smartly combine all useful data - imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \ - condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype) - - imgs, res_coarse, res_fine = sparse_scene_optimizer( - imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths, mst, - shared_intrinsics=shared_intrinsics, cache_path=cache_path, device=device, dtype=dtype, **kw) - - return SparseGA(imgs, pairs_in, res_fine or res_coarse, anchors, canonical_paths) - - -def sparse_scene_optimizer(imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, - preds_21, canonical_paths, mst, cache_path, - lr1=0.2, niter1=500, loss1=gamma_loss(1.1), - lr2=0.02, niter2=500, loss2=gamma_loss(0.4), - lossd=gamma_loss(1.1), - opt_pp=True, opt_depth=True, - schedule=cosine_schedule, depth_mode='add', exp_depth=False, - lora_depth=False, # dict(k=96, gamma=15, min_norm=.5), - shared_intrinsics=False, - init={}, device='cuda', dtype=torch.float32, - matching_conf_thr=5., loss_dust3r_w=0.01, - verbose=True, dbg=()): - init = copy.deepcopy(init) - # extrinsic parameters - vec0001 = torch.tensor((0, 0, 0, 1), dtype=dtype, device=device) - quats = [nn.Parameter(vec0001.clone()) for _ in range(len(imgs))] - trans = [nn.Parameter(torch.zeros(3, device=device, dtype=dtype)) for _ in range(len(imgs))] - - # initialize - ones = torch.ones((len(imgs), 1), device=device, dtype=dtype) - median_depths = torch.ones(len(imgs), device=device, dtype=dtype) - for img in imgs: - idx = imgs.index(img) - init_values = init.setdefault(img, {}) - if verbose and init_values: - print(f' >> initializing img=...{img[-25:]} [{idx}] for {set(init_values)}') - - K = init_values.get('intrinsics') - if K is not None: - K = K.detach() - focal = K[:2, :2].diag().mean() - pp = K[:2, 2] - base_focals[idx] = focal - pps[idx] = pp - pps[idx] /= imsizes[idx] # default principal_point would be (0.5, 0.5) - - depth = init_values.get('depthmap') - if depth is not None: - core_depth[idx] = depth.detach() - - median_depths[idx] = med_depth = core_depth[idx].median() - core_depth[idx] /= med_depth - - cam2w = init_values.get('cam2w') - if cam2w is not None: - rot = cam2w[:3, :3].detach() - cam_center = cam2w[:3, 3].detach() - quats[idx].data[:] = roma.rotmat_to_unitquat(rot) - trans_offset = med_depth * torch.cat((imsizes[idx] / base_focals[idx] * (0.5 - pps[idx]), ones[:1, 0])) - trans[idx].data[:] = cam_center + rot @ trans_offset - del rot - assert False, 'inverse kinematic chain not yet implemented' - - # intrinsics parameters - if shared_intrinsics: - # Optimize a single set of intrinsics for all cameras. Use averages as init. - confs = torch.stack([torch.load(pth)[0][2].mean() for pth in canonical_paths]).to(pps) - weighting = confs / confs.sum() - pp = nn.Parameter((weighting @ pps).to(dtype)) - pps = [pp for _ in range(len(imgs))] - focal_m = weighting @ base_focals - log_focal = nn.Parameter(focal_m.view(1).log().to(dtype)) - log_focals = [log_focal for _ in range(len(imgs))] - else: - pps = [nn.Parameter(pp.to(dtype)) for pp in pps] - log_focals = [nn.Parameter(f.view(1).log().to(dtype)) for f in base_focals] - - diags = imsizes.float().norm(dim=1) - min_focals = 0.25 * diags # diag = 1.2~1.4*max(W,H) => beta >= 1/(2*1.2*tan(fov/2)) ~= 0.26 - max_focals = 10 * diags - - assert len(mst[1]) == len(pps) - 1 - - def make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth): - # make intrinsics - focals = torch.cat(log_focals).exp().clip(min=min_focals, max=max_focals) - pps = torch.stack(pps) - K = torch.eye(3, dtype=dtype, device=device)[None].expand(len(imgs), 3, 3).clone() - K[:, 0, 0] = K[:, 1, 1] = focals - K[:, 0:2, 2] = pps * imsizes - if trans is None: - return K - - # security! optimization is always trying to crush the scale down - sizes = torch.cat(log_sizes).exp() - global_scaling = 1 / sizes.min() - - # compute distance of camera to focal plane - # tan(fov) = W/2 / focal - z_cameras = sizes * median_depths * focals / base_focals - - # make extrinsic - rel_cam2cam = torch.eye(4, dtype=dtype, device=device)[None].expand(len(imgs), 4, 4).clone() - rel_cam2cam[:, :3, :3] = roma.unitquat_to_rotmat(F.normalize(torch.stack(quats), dim=1)) - rel_cam2cam[:, :3, 3] = torch.stack(trans) - - # camera are defined as a kinematic chain - tmp_cam2w = [None] * len(K) - tmp_cam2w[mst[0]] = rel_cam2cam[mst[0]] - for i, j in mst[1]: - # i is the cam_i_to_world reference, j is the relative pose = cam_j_to_cam_i - tmp_cam2w[j] = tmp_cam2w[i] @ rel_cam2cam[j] - tmp_cam2w = torch.stack(tmp_cam2w) - - # smart reparameterizaton of cameras - trans_offset = z_cameras.unsqueeze(1) * torch.cat((imsizes / focals.unsqueeze(1) * (0.5 - pps), ones), dim=-1) - new_trans = global_scaling * (tmp_cam2w[:, :3, 3:4] - tmp_cam2w[:, :3, :3] @ trans_offset.unsqueeze(-1)) - cam2w = torch.cat((torch.cat((tmp_cam2w[:, :3, :3], new_trans), dim=2), - vec0001.view(1, 1, 4).expand(len(K), 1, 4)), dim=1) - - depthmaps = [] - for i in range(len(imgs)): - core_depth_img = core_depth[i] - if exp_depth: - core_depth_img = core_depth_img.exp() - if lora_depth: # compute core_depth as a low-rank decomposition of 3d points - core_depth_img = lora_depth_proj[i] @ core_depth_img - if depth_mode == 'add': - core_depth_img = z_cameras[i] + (core_depth_img - 1) * (median_depths[i] * sizes[i]) - elif depth_mode == 'mul': - core_depth_img = z_cameras[i] * core_depth_img - else: - raise ValueError(f'Bad {depth_mode=}') - depthmaps.append(global_scaling * core_depth_img) - - return K, (inv(cam2w), cam2w), depthmaps - - K = make_K_cam_depth(log_focals, pps, None, None, None, None) - - if shared_intrinsics: - print('init focal (shared) = ', to_numpy(K[0, 0, 0]).round(2)) - else: - print('init focals =', to_numpy(K[:, 0, 0])) - - # spectral low-rank projection of depthmaps - if lora_depth: - core_depth, lora_depth_proj = spectral_projection_of_depthmaps( - imgs, K, core_depth, subsample, cache_path=cache_path, **lora_depth) - if exp_depth: - core_depth = [d.clip(min=1e-4).log() for d in core_depth] - core_depth = [nn.Parameter(d.ravel().to(dtype)) for d in core_depth] - log_sizes = [nn.Parameter(torch.zeros(1, dtype=dtype, device=device)) for _ in range(len(imgs))] - - # Fetch img slices - _, confs_sum, imgs_slices = corres - - # Define which pairs are fine to use with matching - def matching_check(x): return x.max() > matching_conf_thr - is_matching_ok = {} - for s in imgs_slices: - is_matching_ok[s.img1, s.img2] = matching_check(s.confs) - - # Prepare slices and corres for losses - dust3r_slices = [s for s in imgs_slices if not is_matching_ok[s.img1, s.img2]] - loss3d_slices = [s for s in imgs_slices if is_matching_ok[s.img1, s.img2]] - cleaned_corres2d = [] - for cci, (img1, pix1, confs, confsum, imgs_slices) in enumerate(corres2d): - cf_sum = 0 - pix1_filtered = [] - confs_filtered = [] - curstep = 0 - cleaned_slices = [] - for img2, slice2 in imgs_slices: - if is_matching_ok[img1, img2]: - tslice = slice(curstep, curstep + slice2.stop - slice2.start, slice2.step) - pix1_filtered.append(pix1[tslice]) - confs_filtered.append(confs[tslice]) - cleaned_slices.append((img2, slice2)) - curstep += slice2.stop - slice2.start - if pix1_filtered != []: - pix1_filtered = torch.cat(pix1_filtered) - confs_filtered = torch.cat(confs_filtered) - cf_sum = confs_filtered.sum() - cleaned_corres2d.append((img1, pix1_filtered, confs_filtered, cf_sum, cleaned_slices)) - - def loss_dust3r(cam2w, pts3d, pix_loss): - # In the case no correspondence could be established, fallback to DUSt3R GA regression loss formulation (sparsified) - loss = 0. - cf_sum = 0. - for s in dust3r_slices: - if init[imgs[s.img1]].get('freeze') and init[imgs[s.img2]].get('freeze'): - continue - # fallback to dust3r regression - tgt_pts, tgt_confs = preds_21[imgs[s.img2]][imgs[s.img1]] - tgt_pts = geotrf(cam2w[s.img2], tgt_pts) - cf_sum += tgt_confs.sum() - loss += tgt_confs @ pix_loss(pts3d[s.img1], tgt_pts) - return loss / cf_sum if cf_sum != 0. else 0. - - def loss_3d(K, w2cam, pts3d, pix_loss): - # For each correspondence, we have two 3D points (one for each image of the pair). - # For each 3D point, we have 2 reproj errors - if any(v.get('freeze') for v in init.values()): - pts3d_1 = [] - pts3d_2 = [] - confs = [] - for s in loss3d_slices: - if init[imgs[s.img1]].get('freeze') and init[imgs[s.img2]].get('freeze'): - continue - pts3d_1.append(pts3d[s.img1][s.slice1]) - pts3d_2.append(pts3d[s.img2][s.slice2]) - confs.append(s.confs) - else: - pts3d_1 = [pts3d[s.img1][s.slice1] for s in loss3d_slices] - pts3d_2 = [pts3d[s.img2][s.slice2] for s in loss3d_slices] - confs = [s.confs for s in loss3d_slices] - - if pts3d_1 != []: - confs = torch.cat(confs) - pts3d_1 = torch.cat(pts3d_1) - pts3d_2 = torch.cat(pts3d_2) - loss = confs @ pix_loss(pts3d_1, pts3d_2) - cf_sum = confs.sum() - else: - loss = 0. - cf_sum = 1. - - return loss / cf_sum - - def loss_2d(K, w2cam, pts3d, pix_loss): - # For each correspondence, we have two 3D points (one for each image of the pair). - # For each 3D point, we have 2 reproj errors - proj_matrix = K @ w2cam[:, :3] - loss = npix = 0 - for img1, pix1_filtered, confs_filtered, cf_sum, cleaned_slices in cleaned_corres2d: - if init[imgs[img1]].get('freeze', 0) >= 1: - continue # no need - pts3d_in_img1 = [pts3d[img2][slice2] for img2, slice2 in cleaned_slices] - if pts3d_in_img1 != []: - pts3d_in_img1 = torch.cat(pts3d_in_img1) - loss += confs_filtered @ pix_loss(pix1_filtered, reproj2d(proj_matrix[img1], pts3d_in_img1)) - npix += confs_filtered.sum() - - return loss / npix if npix != 0 else 0. - - def optimize_loop(loss_func, lr_base, niter, pix_loss, lr_end=0): - # create optimizer - params = pps + log_focals + quats + trans + log_sizes + core_depth - optimizer = torch.optim.Adam(params, lr=1, weight_decay=0, betas=(0.9, 0.9)) - ploss = pix_loss if 'meta' in repr(pix_loss) else (lambda a: pix_loss) - - with tqdm(total=niter) as bar: - for iter in range(niter or 1): - K, (w2cam, cam2w), depthmaps = make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth) - pts3d = make_pts3d(anchors, K, cam2w, depthmaps, base_focals=base_focals) - if niter == 0: - break - - alpha = (iter / niter) - lr = schedule(alpha, lr_base, lr_end) - adjust_learning_rate_by_lr(optimizer, lr) - pix_loss = ploss(1 - alpha) - optimizer.zero_grad() - loss = loss_func(K, w2cam, pts3d, pix_loss) + loss_dust3r_w * loss_dust3r(cam2w, pts3d, lossd) - loss.backward() - optimizer.step() - - # make sure the pose remains well optimizable - for i in range(len(imgs)): - quats[i].data[:] /= quats[i].data.norm() - - loss = float(loss) - if loss != loss: - break # NaN loss - bar.set_postfix_str(f'{lr=:.4f}, {loss=:.3f}') - bar.update(1) - - if niter: - print(f'>> final loss = {loss}') - return dict(intrinsics=K.detach(), cam2w=cam2w.detach(), - depthmaps=[d.detach() for d in depthmaps], pts3d=[p.detach() for p in pts3d]) - - # at start, don't optimize 3d points - for i, img in enumerate(imgs): - trainable = not (init[img].get('freeze')) - pps[i].requires_grad_(False) - log_focals[i].requires_grad_(False) - quats[i].requires_grad_(trainable) - trans[i].requires_grad_(trainable) - log_sizes[i].requires_grad_(trainable) - core_depth[i].requires_grad_(False) - - res_coarse = optimize_loop(loss_3d, lr_base=lr1, niter=niter1, pix_loss=loss1) - - res_fine = None - if niter2: - # now we can optimize 3d points - for i, img in enumerate(imgs): - if init[img].get('freeze', 0) >= 1: - continue - pps[i].requires_grad_(bool(opt_pp)) - log_focals[i].requires_grad_(True) - core_depth[i].requires_grad_(opt_depth) - - # refinement with 2d reproj - res_fine = optimize_loop(loss_2d, lr_base=lr2, niter=niter2, pix_loss=loss2) - - K = make_K_cam_depth(log_focals, pps, None, None, None, None) - if shared_intrinsics: - print('Final focal (shared) = ', to_numpy(K[0, 0, 0]).round(2)) - else: - print('Final focals =', to_numpy(K[:, 0, 0])) - - return imgs, res_coarse, res_fine - - -@lru_cache -def mask110(device, dtype): - return torch.tensor((1, 1, 0), device=device, dtype=dtype) - - -def proj3d(inv_K, pixels, z): - if pixels.shape[-1] == 2: - pixels = torch.cat((pixels, torch.ones_like(pixels[..., :1])), dim=-1) - return z.unsqueeze(-1) * (pixels * inv_K.diag() + inv_K[:, 2] * mask110(z.device, z.dtype)) - - -def make_pts3d(anchors, K, cam2w, depthmaps, base_focals=None, ret_depth=False): - focals = K[:, 0, 0] - invK = inv(K) - all_pts3d = [] - depth_out = [] - - for img, (pixels, idxs, offsets) in anchors.items(): - # from depthmaps to 3d points - if base_focals is None: - pass - else: - # compensate for focal - # depth + depth * (offset - 1) * base_focal / focal - # = depth * (1 + (offset - 1) * (base_focal / focal)) - offsets = 1 + (offsets - 1) * (base_focals[img] / focals[img]) - - pts3d = proj3d(invK[img], pixels, depthmaps[img][idxs] * offsets) - if ret_depth: - depth_out.append(pts3d[..., 2]) # before camera rotation - - # rotate to world coordinate - pts3d = geotrf(cam2w[img], pts3d) - all_pts3d.append(pts3d) - - if ret_depth: - return all_pts3d, depth_out - return all_pts3d - - -def make_dense_pts3d(intrinsics, cam2w, depthmaps, canonical_paths, subsample, device='cuda'): - base_focals = [] - anchors = {} - confs = [] - for i, canon_path in enumerate(canonical_paths): - (canon, canon2, conf), focal = torch.load(canon_path, map_location=device) - confs.append(conf) - base_focals.append(focal) - H, W = conf.shape - pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device) - idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample) - anchors[i] = (pixels, idxs[i], offsets[i]) - - # densify sparse depthmaps - pts3d, depthmaps_out = make_pts3d(anchors, intrinsics, cam2w, [ - d.ravel() for d in depthmaps], base_focals=base_focals, ret_depth=True) - - return pts3d, depthmaps_out, confs - - -@torch.no_grad() -def forward_mast3r(pairs, model, cache_path, desc_conf='desc_conf', - device='cuda', subsample=8, **matching_kw): - res_paths = {} - - for img1, img2 in tqdm(pairs): - idx1 = hash_md5(img1['instance']) - idx2 = hash_md5(img2['instance']) - - path1 = cache_path + f'/forward/{idx1}/{idx2}.pth' - path2 = cache_path + f'/forward/{idx2}/{idx1}.pth' - path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx1}-{idx2}.pth' - path_corres2 = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx2}-{idx1}.pth' - - if os.path.isfile(path_corres2) and not os.path.isfile(path_corres): - score, (xy1, xy2, confs) = torch.load(path_corres2) - torch.save((score, (xy2, xy1, confs)), path_corres) - - if not all(os.path.isfile(p) for p in (path1, path2, path_corres)): - if model is None: - continue - res = symmetric_inference(model, img1, img2, device=device) - X11, X21, X22, X12 = [r['pts3d'][0] for r in res] - C11, C21, C22, C12 = [r['conf'][0] for r in res] - descs = [r['desc'][0] for r in res] - qonfs = [r[desc_conf][0] for r in res] - - # save - torch.save(to_cpu((X11, C11, X21, C21)), mkdir_for(path1)) - torch.save(to_cpu((X22, C22, X12, C12)), mkdir_for(path2)) - - # perform reciprocal matching - corres = extract_correspondences(descs, qonfs, device=device, subsample=subsample) - - conf_score = (C11.mean() * C12.mean() * C21.mean() * C22.mean()).sqrt().sqrt() - matching_score = (float(conf_score), float(corres[2].sum()), len(corres[2])) - if cache_path is not None: - torch.save((matching_score, corres), mkdir_for(path_corres)) - - res_paths[img1['instance'], img2['instance']] = (path1, path2), path_corres - - del model - torch.cuda.empty_cache() - - return res_paths, cache_path - - -def symmetric_inference(model, img1, img2, device): - shape1 = torch.from_numpy(img1['true_shape']).to(device, non_blocking=True) - shape2 = torch.from_numpy(img2['true_shape']).to(device, non_blocking=True) - img1 = img1['img'].to(device, non_blocking=True) - img2 = img2['img'].to(device, non_blocking=True) - - # compute encoder only once - feat1, feat2, pos1, pos2 = model._encode_image_pairs(img1, img2, shape1, shape2) - - def decoder(feat1, feat2, pos1, pos2, shape1, shape2): - dec1, dec2 = model._decoder(feat1, pos1, feat2, pos2) - with torch.cuda.amp.autocast(enabled=False): - res1 = model._downstream_head(1, [tok.float() for tok in dec1], shape1) - res2 = model._downstream_head(2, [tok.float() for tok in dec2], shape2) - return res1, res2 - - # decoder 1-2 - res11, res21 = decoder(feat1, feat2, pos1, pos2, shape1, shape2) - # decoder 2-1 - res22, res12 = decoder(feat2, feat1, pos2, pos1, shape2, shape1) - - return (res11, res21, res22, res12) - - -def extract_correspondences(feats, qonfs, subsample=8, device=None, ptmap_key='pred_desc'): - feat11, feat21, feat22, feat12 = feats - qonf11, qonf21, qonf22, qonf12 = qonfs - assert feat11.shape[:2] == feat12.shape[:2] == qonf11.shape == qonf12.shape - assert feat21.shape[:2] == feat22.shape[:2] == qonf21.shape == qonf22.shape - - if '3d' in ptmap_key: - opt = dict(device='cpu', workers=32) - else: - opt = dict(device=device, dist='dot', block_size=2**13) - - # matching the two pairs - idx1 = [] - idx2 = [] - qonf1 = [] - qonf2 = [] - # TODO add non symmetric / pixel_tol options - for A, B, QA, QB in [(feat11, feat21, qonf11.cpu(), qonf21.cpu()), - (feat12, feat22, qonf12.cpu(), qonf22.cpu())]: - nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt) - nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt) - - idx1.append(np.r_[nn1to2[0], nn2to1[1]]) - idx2.append(np.r_[nn1to2[1], nn2to1[0]]) - qonf1.append(QA.ravel()[idx1[-1]]) - qonf2.append(QB.ravel()[idx2[-1]]) - - # merge corres from opposite pairs - H1, W1 = feat11.shape[:2] - H2, W2 = feat22.shape[:2] - cat = np.concatenate - - xy1, xy2, idx = merge_corres(cat(idx1), cat(idx2), (H1, W1), (H2, W2), ret_xy=True, ret_index=True) - corres = (xy1.copy(), xy2.copy(), np.sqrt(cat(qonf1)[idx] * cat(qonf2)[idx])) - - return todevice(corres, device) - - -@torch.no_grad() -def prepare_canonical_data(imgs, tmp_pairs, subsample, order_imgs=False, min_conf_thr=0, - cache_path=None, device='cuda', **kw): - canonical_views = {} - pairwise_scores = torch.zeros((len(imgs), len(imgs)), device=device) - canonical_paths = [] - preds_21 = {} - - for img in tqdm(imgs): - if cache_path: - cache = os.path.join(cache_path, 'canon_views', hash_md5(img) + f'_{subsample=}_{kw=}.pth') - canonical_paths.append(cache) - try: - (canon, canon2, cconf), focal = torch.load(cache, map_location=device) - except IOError: - # cache does not exist yet, we create it! - canon = focal = None - - # collect all pred1 - n_pairs = sum((img in pair) for pair in tmp_pairs) - - ptmaps11 = None - pixels = {} - n = 0 - for (img1, img2), ((path1, path2), path_corres) in tmp_pairs.items(): - score = None - if img == img1: - X, C, X2, C2 = torch.load(path1, map_location=device) - score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr) - pixels[img2] = xy1, confs - if img not in preds_21: - preds_21[img] = {} - # Subsample preds_21 - preds_21[img][img2] = X2[::subsample, ::subsample].reshape(-1, 3), C2[::subsample, ::subsample].ravel() - - if img == img2: - X, C, X2, C2 = torch.load(path2, map_location=device) - score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr) - pixels[img1] = xy2, confs - if img not in preds_21: - preds_21[img] = {} - preds_21[img][img1] = X2[::subsample, ::subsample].reshape(-1, 3), C2[::subsample, ::subsample].ravel() - - if score is not None: - i, j = imgs.index(img1), imgs.index(img2) - # score = score[0] - # score = np.log1p(score[2]) - score = score[2] - pairwise_scores[i, j] = score - pairwise_scores[j, i] = score - - if canon is not None: - continue - if ptmaps11 is None: - H, W = C.shape - ptmaps11 = torch.empty((n_pairs, H, W, 3), device=device) - confs11 = torch.empty((n_pairs, H, W), device=device) - - ptmaps11[n] = X - confs11[n] = C - n += 1 - - if canon is None: - canon, canon2, cconf = canonical_view(ptmaps11, confs11, subsample, **kw) - del ptmaps11 - del confs11 - - # compute focals - H, W = canon.shape[:2] - pp = torch.tensor([W / 2, H / 2], device=device) - if focal is None: - focal = estimate_focal_knowing_depth(canon[None], pp, focal_mode='weiszfeld', min_focal=0.5, max_focal=3.5) - if cache: - torch.save(to_cpu(((canon, canon2, cconf), focal)), mkdir_for(cache)) - - # extract depth offsets with correspondences - core_depth = canon[subsample // 2::subsample, subsample // 2::subsample, 2] - idxs, offsets = anchor_depth_offsets(canon2, pixels, subsample=subsample) - - canonical_views[img] = (pp, (H, W), focal.view(1), core_depth, pixels, idxs, offsets) - - return tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 - - -def load_corres(path_corres, device, min_conf_thr): - score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device) - valid = confs > min_conf_thr if min_conf_thr else slice(None) - # valid = (xy1 > 0).all(dim=1) & (xy2 > 0).all(dim=1) & (xy1 < 512).all(dim=1) & (xy2 < 512).all(dim=1) - # print(f'keeping {valid.sum()} / {len(valid)} correspondences') - return score, (xy1[valid], xy2[valid], confs[valid]) - - -PairOfSlices = namedtuple( - 'ImgPair', 'img1, slice1, pix1, anchor_idxs1, img2, slice2, pix2, anchor_idxs2, confs, confs_sum') - - -def condense_data(imgs, tmp_paths, canonical_views, preds_21, dtype=torch.float32): - # aggregate all data properly - set_imgs = set(imgs) - - principal_points = [] - shapes = [] - focals = [] - core_depth = [] - img_anchors = {} - tmp_pixels = {} - - for idx1, img1 in enumerate(imgs): - # load stuff - pp, shape, focal, anchors, pixels_confs, idxs, offsets = canonical_views[img1] - - principal_points.append(pp) - shapes.append(shape) - focals.append(focal) - core_depth.append(anchors) - - img_uv1 = [] - img_idxs = [] - img_offs = [] - cur_n = [0] - - for img2, (pixels, match_confs) in pixels_confs.items(): - if img2 not in set_imgs: - continue - assert len(pixels) == len(idxs[img2]) == len(offsets[img2]) - img_uv1.append(torch.cat((pixels, torch.ones_like(pixels[:, :1])), dim=-1)) - img_idxs.append(idxs[img2]) - img_offs.append(offsets[img2]) - cur_n.append(cur_n[-1] + len(pixels)) - # store the position of 3d points - tmp_pixels[img1, img2] = pixels.to(dtype), match_confs.to(dtype), slice(*cur_n[-2:]) - img_anchors[idx1] = (torch.cat(img_uv1), torch.cat(img_idxs), torch.cat(img_offs)) - - all_confs = [] - imgs_slices = [] - corres2d = {img: [] for img in range(len(imgs))} - - for img1, img2 in tmp_paths: - try: - pix1, confs1, slice1 = tmp_pixels[img1, img2] - pix2, confs2, slice2 = tmp_pixels[img2, img1] - except KeyError: - continue - img1 = imgs.index(img1) - img2 = imgs.index(img2) - confs = (confs1 * confs2).sqrt() - - # prepare for loss_3d - all_confs.append(confs) - anchor_idxs1 = canonical_views[imgs[img1]][5][imgs[img2]] - anchor_idxs2 = canonical_views[imgs[img2]][5][imgs[img1]] - imgs_slices.append(PairOfSlices(img1, slice1, pix1, anchor_idxs1, - img2, slice2, pix2, anchor_idxs2, - confs, float(confs.sum()))) - - # prepare for loss_2d - corres2d[img1].append((pix1, confs, img2, slice2)) - corres2d[img2].append((pix2, confs, img1, slice1)) - - all_confs = torch.cat(all_confs) - corres = (all_confs, float(all_confs.sum()), imgs_slices) - - def aggreg_matches(img1, list_matches): - pix1, confs, img2, slice2 = zip(*list_matches) - all_pix1 = torch.cat(pix1).to(dtype) - all_confs = torch.cat(confs).to(dtype) - return img1, all_pix1, all_confs, float(all_confs.sum()), [(j, sl2) for j, sl2 in zip(img2, slice2)] - corres2d = [aggreg_matches(img, m) for img, m in corres2d.items()] - - imsizes = torch.tensor([(W, H) for H, W in shapes], device=pp.device) # (W,H) - principal_points = torch.stack(principal_points) - focals = torch.cat(focals) - - # Subsample preds_21 - subsamp_preds_21 = {} - for imk, imv in preds_21.items(): - subsamp_preds_21[imk] = {} - for im2k, (pred, conf) in preds_21[imk].items(): - idxs = img_anchors[imgs.index(im2k)][1] - subsamp_preds_21[imk][im2k] = (pred[idxs], conf[idxs]) # anchors subsample - - return imsizes, principal_points, focals, core_depth, img_anchors, corres, corres2d, subsamp_preds_21 - - -def canonical_view(ptmaps11, confs11, subsample, mode='avg-angle'): - assert len(ptmaps11) == len(confs11) > 0, 'not a single view1 for img={i}' - - # canonical pointmap is just a weighted average - confs11 = confs11.unsqueeze(-1) - 0.999 - canon = (confs11 * ptmaps11).sum(0) / confs11.sum(0) - - canon_depth = ptmaps11[..., 2].unsqueeze(1) - S = slice(subsample // 2, None, subsample) - center_depth = canon_depth[:, :, S, S] - center_depth = torch.clip(center_depth, min=torch.finfo(center_depth.dtype).eps) - - stacked_depth = F.pixel_unshuffle(canon_depth, subsample) - stacked_confs = F.pixel_unshuffle(confs11[:, None, :, :, 0], subsample) - - if mode == 'avg-reldepth': - rel_depth = stacked_depth / center_depth - stacked_canon = (stacked_confs * rel_depth).sum(dim=0) / stacked_confs.sum(dim=0) - canon2 = F.pixel_shuffle(stacked_canon.unsqueeze(0), subsample).squeeze() - - elif mode == 'avg-angle': - xy = ptmaps11[..., 0:2].permute(0, 3, 1, 2) - stacked_xy = F.pixel_unshuffle(xy, subsample) - B, _, H, W = stacked_xy.shape - stacked_radius = (stacked_xy.view(B, 2, -1, H, W) - xy[:, :, None, S, S]).norm(dim=1) - stacked_radius.clip_(min=1e-8) - - stacked_angle = torch.arctan((stacked_depth - center_depth) / stacked_radius) - avg_angle = (stacked_confs * stacked_angle).sum(dim=0) / stacked_confs.sum(dim=0) - - # back to depth - stacked_depth = stacked_radius.mean(dim=0) * torch.tan(avg_angle) - - canon2 = F.pixel_shuffle((1 + stacked_depth / canon[S, S, 2]).unsqueeze(0), subsample).squeeze() - else: - raise ValueError(f'bad {mode=}') - - confs = (confs11.square().sum(dim=0) / confs11.sum(dim=0)).squeeze() - return canon, canon2, confs - - -def anchor_depth_offsets(canon_depth, pixels, subsample=8): - device = canon_depth.device - - # create a 2D grid of anchor 3D points - H1, W1 = canon_depth.shape - yx = np.mgrid[subsample // 2:H1:subsample, subsample // 2:W1:subsample] - H2, W2 = yx.shape[1:] - cy, cx = yx.reshape(2, -1) - core_depth = canon_depth[cy, cx] - assert (core_depth > 0).all() - - # slave 3d points (attached to core 3d points) - core_idxs = {} # core_idxs[img2] = {corr_idx:core_idx} - core_offs = {} # core_offs[img2] = {corr_idx:3d_offset} - - for img2, (xy1, _confs) in pixels.items(): - px, py = xy1.long().T - - # find nearest anchor == block quantization - core_idx = (py // subsample) * W2 + (px // subsample) - core_idxs[img2] = core_idx.to(device) - - # compute relative depth offsets w.r.t. anchors - ref_z = core_depth[core_idx] - pts_z = canon_depth[py, px] - offset = pts_z / ref_z - core_offs[img2] = offset.detach().to(device) - - return core_idxs, core_offs - - -def spectral_clustering(graph, k=None, normalized_cuts=False): - graph.fill_diagonal_(0) - - # graph laplacian - degrees = graph.sum(dim=-1) - laplacian = torch.diag(degrees) - graph - if normalized_cuts: - i_inv = torch.diag(degrees.sqrt().reciprocal()) - laplacian = i_inv @ laplacian @ i_inv - - # compute eigenvectors! - eigval, eigvec = torch.linalg.eigh(laplacian) - return eigval[:k], eigvec[:, :k] - - -def sim_func(p1, p2, gamma): - diff = (p1 - p2).norm(dim=-1) - avg_depth = (p1[:, :, 2] + p2[:, :, 2]) - rel_distance = diff / avg_depth - sim = torch.exp(-gamma * rel_distance.square()) - return sim - - -def backproj(K, depthmap, subsample): - H, W = depthmap.shape - uv = np.mgrid[subsample // 2:subsample * W:subsample, subsample // 2:subsample * H:subsample].T.reshape(H, W, 2) - xyz = depthmap.unsqueeze(-1) * geotrf(inv(K), todevice(uv, K.device), ncol=3) - return xyz - - -def spectral_projection_depth(K, depthmap, subsample, k=64, cache_path='', - normalized_cuts=True, gamma=7, min_norm=5): - try: - if cache_path: - cache_path = cache_path + f'_{k=}_norm={normalized_cuts}_{gamma=}.pth' - lora_proj = torch.load(cache_path, map_location=K.device) - - except IOError: - # reconstruct 3d points in camera coordinates - xyz = backproj(K, depthmap, subsample) - - # compute all distances - xyz = xyz.reshape(-1, 3) - graph = sim_func(xyz[:, None], xyz[None, :], gamma=gamma) - _, lora_proj = spectral_clustering(graph, k, normalized_cuts=normalized_cuts) - - if cache_path: - torch.save(lora_proj.cpu(), mkdir_for(cache_path)) - - lora_proj, coeffs = lora_encode_normed(lora_proj, depthmap.ravel(), min_norm=min_norm) - - # depthmap ~= lora_proj @ coeffs - return coeffs, lora_proj - - -def lora_encode_normed(lora_proj, x, min_norm, global_norm=False): - # encode the pointmap - coeffs = torch.linalg.pinv(lora_proj) @ x - - # rectify the norm of basis vector to be ~ equal - if coeffs.ndim == 1: - coeffs = coeffs[:, None] - if global_norm: - lora_proj *= coeffs[1:].norm() * min_norm / coeffs.shape[1] - elif min_norm: - lora_proj *= coeffs.norm(dim=1).clip(min=min_norm) - # can have rounding errors here! - coeffs = (torch.linalg.pinv(lora_proj.double()) @ x.double()).float() - - return lora_proj.detach(), coeffs.detach() - - -@torch.no_grad() -def spectral_projection_of_depthmaps(imgs, intrinsics, depthmaps, subsample, cache_path=None, **kw): - # recover 3d points - core_depth = [] - lora_proj = [] - - for i, img in enumerate(tqdm(imgs)): - cache = os.path.join(cache_path, 'lora_depth', hash_md5(img)) if cache_path else None - depth, proj = spectral_projection_depth(intrinsics[i], depthmaps[i], subsample, - cache_path=cache, **kw) - core_depth.append(depth) - lora_proj.append(proj) - - return core_depth, lora_proj - - -def reproj2d(Trf, pts3d): - res = (pts3d @ Trf[:3, :3].transpose(-1, -2)) + Trf[:3, 3] - clipped_z = res[:, 2:3].clip(min=1e-3) # make sure we don't have nans! - uv = res[:, 0:2] / clipped_z - return uv.clip(min=-1000, max=2000) - - -def bfs(tree, start_node): - order, predecessors = sp.csgraph.breadth_first_order(tree, start_node, directed=False) - ranks = np.arange(len(order)) - ranks[order] = ranks.copy() - return ranks, predecessors - - -def compute_min_spanning_tree(pws): - sparse_graph = sp.dok_array(pws.shape) - for i, j in pws.nonzero().cpu().tolist(): - sparse_graph[i, j] = -float(pws[i, j]) - msp = sp.csgraph.minimum_spanning_tree(sparse_graph) - - # now reorder the oriented edges, starting from the central point - ranks1, _ = bfs(msp, 0) - ranks2, _ = bfs(msp, ranks1.argmax()) - ranks1, _ = bfs(msp, ranks2.argmax()) - # this is the point farther from any leaf - root = np.minimum(ranks1, ranks2).argmax() - - # find the ordered list of edges that describe the tree - order, predecessors = sp.csgraph.breadth_first_order(msp, root, directed=False) - order = order[1:] # root not do not have a predecessor - edges = [(predecessors[i], i) for i in order] - - return root, edges - - -def show_reconstruction(shapes_or_imgs, K, cam2w, pts3d, gt_cam2w=None, gt_K=None, cam_size=None, masks=None, **kw): - viz = SceneViz() - - cc = cam2w[:, :3, 3] - cs = cam_size or float(torch.cdist(cc, cc).fill_diagonal_(np.inf).min(dim=0).values.median()) - colors = 64 + np.random.randint(255 - 64, size=(len(cam2w), 3)) - - if isinstance(shapes_or_imgs, np.ndarray) and shapes_or_imgs.ndim == 2: - cam_kws = dict(imsizes=shapes_or_imgs[:, ::-1], cam_size=cs) - else: - imgs = shapes_or_imgs - cam_kws = dict(images=imgs, cam_size=cs) - if K is not None: - viz.add_cameras(to_numpy(cam2w), to_numpy(K), colors=colors, **cam_kws) - - if gt_cam2w is not None: - if gt_K is None: - gt_K = K - viz.add_cameras(to_numpy(gt_cam2w), to_numpy(gt_K), colors=colors, marker='o', **cam_kws) - - if pts3d is not None: - for i, p in enumerate(pts3d): - if not len(p): - continue - if masks is None: - viz.add_pointcloud(to_numpy(p), color=tuple(colors[i].tolist())) - else: - viz.add_pointcloud(to_numpy(p), mask=masks[i], color=imgs[i]) - viz.show(**kw) diff --git a/modules/mast3r/cloud_opt/triangulation.py b/modules/mast3r/cloud_opt/triangulation.py deleted file mode 100644 index 2af88df37bfd360161b4e96b93b0fd28a0ecf183..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/triangulation.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Matches Triangulation Utils -# -------------------------------------------------------- - -import numpy as np -import torch - -# Batched Matches Triangulation -def batched_triangulate(pts2d, # [B, Ncams, Npts, 2] - proj_mats): # [B, Ncams, 3, 4] I@E projection matrix - B, Ncams, Npts, two = pts2d.shape - assert two==2 - assert proj_mats.shape == (B, Ncams, 3, 4) - # P - xP - x = proj_mats[...,0,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,0], proj_mats[...,2,:]) # [B, Ncams, Npts, 4] - y = proj_mats[...,1,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,1], proj_mats[...,2,:]) # [B, Ncams, Npts, 4] - eq = torch.cat([x, y], dim=1).transpose(1, 2) # [B, Npts, 2xNcams, 4] - return torch.linalg.lstsq(eq[...,:3], -eq[...,3]).solution - -def matches_to_depths(intrinsics, # input camera intrinsics [B, Ncams, 3, 3] - extrinsics, # input camera extrinsics [B, Ncams, 3, 4] - matches, # input correspondences [B, Ncams, Npts, 2] - batchsize=16, # bs for batched processing - min_num_valids_ratio=.3 # at least this ratio of image pairs need to predict a match for a given pixel of img1 - ): - B, Nv, H, W, five = matches.shape - min_num_valids = np.floor(Nv*min_num_valids_ratio) - out_aggregated_points, out_depths, out_confs = [], [], [] - for b in range(B//batchsize+1): # batched processing - start, stop = b*batchsize,min(B,(b+1)*batchsize) - sub_batch=slice(start,stop) - sub_batchsize = stop-start - if sub_batchsize==0:continue - points1, points2, confs = matches[sub_batch, ..., :2], matches[sub_batch, ..., 2:4], matches[sub_batch, ..., -1] - allpoints = torch.cat([points1.view([sub_batchsize*Nv,1,H*W,2]), points2.view([sub_batchsize*Nv,1,H*W,2])],dim=1) # [BxNv, 2, HxW, 2] - - allcam_Ps = intrinsics[sub_batch] @ extrinsics[sub_batch,:,:3,:] - cam_Ps1, cam_Ps2 = allcam_Ps[:,[0]].repeat([1,Nv,1,1]), allcam_Ps[:,1:] # [B, Nv, 3, 4] - formatted_camPs = torch.cat([cam_Ps1.reshape([sub_batchsize*Nv,1,3,4]), cam_Ps2.reshape([sub_batchsize*Nv,1,3,4])],dim=1) # [BxNv, 2, 3, 4] - - # Triangulate matches to 3D - points_3d_world = batched_triangulate(allpoints, formatted_camPs) # [BxNv, HxW, three] - - # Aggregate pairwise predictions - points_3d_world = points_3d_world.view([sub_batchsize,Nv,H,W,3]) - valids = points_3d_world.isfinite() - valids_sum = valids.sum(dim=-1) - validsuni=valids_sum.unique() - assert torch.all(torch.logical_or(validsuni == 0 , validsuni == 3)), "Error, can only be nan for none or all XYZ values, not a subset" - confs[valids_sum==0] = 0. - points_3d_world = points_3d_world*confs[...,None] - - # Take care of NaNs - normalization = confs.sum(dim=1)[:,None].repeat(1,Nv,1,1) - normalization[normalization <= 1e-5] = 1. - points_3d_world[valids] /= normalization[valids_sum==3][:,None].repeat(1,3).view(-1) - points_3d_world[~valids] = 0. - aggregated_points = points_3d_world.sum(dim=1) # weighted average (by confidence value) ignoring nans - - # Reset invalid values to nans, with a min visibility threshold - aggregated_points[valids_sum.sum(dim=1)/3 <= min_num_valids] = torch.nan - - # From 3D to depths - refcamE = extrinsics[sub_batch, 0] - points_3d_camera = (refcamE[:,:3, :3] @ aggregated_points.view(sub_batchsize,-1,3).transpose(-2,-1) + refcamE[:,:3,[3]]).transpose(-2,-1) # [B,HxW,3] - depths = points_3d_camera.view(sub_batchsize,H,W,3)[..., 2] # [B,H,W] - - # Cat results - out_aggregated_points.append(aggregated_points.cpu()) - out_depths.append(depths.cpu()) - out_confs.append(confs.sum(dim=1).cpu()) - - out_aggregated_points = torch.cat(out_aggregated_points,dim=0) - out_depths = torch.cat(out_depths,dim=0) - out_confs = torch.cat(out_confs,dim=0) - - return out_aggregated_points, out_depths, out_confs diff --git a/modules/mast3r/cloud_opt/tsdf_optimizer.py b/modules/mast3r/cloud_opt/tsdf_optimizer.py deleted file mode 100644 index 69f138c0301e4ad3cd4804d265f241b923e1b2b8..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/tsdf_optimizer.py +++ /dev/null @@ -1,273 +0,0 @@ -import torch -from torch import nn -import numpy as np -from tqdm import tqdm -from matplotlib import pyplot as pl - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.utils.geometry import depthmap_to_pts3d, geotrf, inv -from dust3r.cloud_opt.base_opt import clean_pointcloud - - -class TSDFPostProcess: - """ Optimizes a signed distance-function to improve depthmaps. - """ - - def __init__(self, optimizer, subsample=8, TSDF_thresh=0., TSDF_batchsize=int(1e7)): - self.TSDF_thresh = TSDF_thresh # None -> no TSDF - self.TSDF_batchsize = TSDF_batchsize - self.optimizer = optimizer - - pts3d, depthmaps, confs = optimizer.get_dense_pts3d(clean_depth=False, subsample=subsample) - pts3d, depthmaps = self._TSDF_postprocess_or_not(pts3d, depthmaps, confs) - self.pts3d = pts3d - self.depthmaps = depthmaps - self.confs = confs - - def _get_depthmaps(self, TSDF_filtering_thresh=None): - if TSDF_filtering_thresh: - self._refine_depths_with_TSDF(self.optimizer, TSDF_filtering_thresh) # compute refined depths if needed - dms = self.TSDF_im_depthmaps if TSDF_filtering_thresh else self.im_depthmaps - return [d.exp() for d in dms] - - @torch.no_grad() - def _refine_depths_with_TSDF(self, TSDF_filtering_thresh, niter=1, nsamples=1000): - """ - Leverage TSDF to post-process estimated depths - for each pixel, find zero level of TSDF along ray (or closest to 0) - """ - print("Post-Processing Depths with TSDF fusion.") - self.TSDF_im_depthmaps = [] - alldepths, allposes, allfocals, allpps, allimshapes = self._get_depthmaps(), self.optimizer.get_im_poses( - ), self.optimizer.get_focals(), self.optimizer.get_principal_points(), self.imshapes - for vi in tqdm(range(self.optimizer.n_imgs)): - dm, pose, focal, pp, imshape = alldepths[vi], allposes[vi], allfocals[vi], allpps[vi], allimshapes[vi] - minvals = torch.full(dm.shape, 1e20) - - for it in range(niter): - H, W = dm.shape - curthresh = (niter - it) * TSDF_filtering_thresh - dm_offsets = (torch.randn(H, W, nsamples).to(dm) - 1.) * \ - curthresh # decreasing search std along with iterations - newdm = dm[..., None] + dm_offsets # [H,W,Nsamp] - curproj = self._backproj_pts3d(in_depths=[newdm], in_im_poses=pose[None], in_focals=focal[None], in_pps=pp[None], in_imshapes=[ - imshape])[0] # [H,W,Nsamp,3] - # Batched TSDF eval - curproj = curproj.view(-1, 3) - tsdf_vals = [] - valids = [] - for batch in range(0, len(curproj), self.TSDF_batchsize): - values, valid = self._TSDF_query( - curproj[batch:min(batch + self.TSDF_batchsize, len(curproj))], curthresh) - tsdf_vals.append(values) - valids.append(valid) - tsdf_vals = torch.cat(tsdf_vals, dim=0) - valids = torch.cat(valids, dim=0) - - tsdf_vals = tsdf_vals.view([H, W, nsamples]) - valids = valids.view([H, W, nsamples]) - - # keep depth value that got us the closest to 0 - tsdf_vals[~valids] = torch.inf # ignore invalid values - tsdf_vals = tsdf_vals.abs() - mins = torch.argmin(tsdf_vals, dim=-1, keepdim=True) - # when all samples live on a very flat zone, do nothing - allbad = (tsdf_vals == curthresh).sum(dim=-1) == nsamples - dm[~allbad] = torch.gather(newdm, -1, mins)[..., 0][~allbad] - - # Save refined depth map - self.TSDF_im_depthmaps.append(dm.log()) - - def _TSDF_query(self, qpoints, TSDF_filtering_thresh, weighted=True): - """ - TSDF query call: returns the weighted TSDF value for each query point [N, 3] - """ - N, three = qpoints.shape - assert three == 3 - qpoints = qpoints[None].repeat(self.optimizer.n_imgs, 1, 1) # [B,N,3] - # get projection coordinates and depths onto images - coords_and_depth = self._proj_pts3d(pts3d=qpoints, cam2worlds=self.optimizer.get_im_poses( - ), focals=self.optimizer.get_focals(), pps=self.optimizer.get_principal_points()) - image_coords = coords_and_depth[..., :2].round().to(int) # for now, there's no interpolation... - proj_depths = coords_and_depth[..., -1] - # recover depth values after scene optim - pred_depths, pred_confs, valids = self._get_pixel_depths(image_coords) - # Gather TSDF scores - all_SDF_scores = pred_depths - proj_depths # SDF - unseen = all_SDF_scores < -TSDF_filtering_thresh # handle visibility - # all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh,TSDF_filtering_thresh) # SDF -> TSDF - all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh, 1e20) # SDF -> TSDF - # Gather TSDF confidences and ignore points that are unseen, either OOB during reproj or too far behind seen depth - all_TSDF_weights = (~unseen).float() * valids.float() - if weighted: - all_TSDF_weights = pred_confs.exp() * all_TSDF_weights - # Aggregate all votes, ignoring zeros - TSDF_weights = all_TSDF_weights.sum(dim=0) - valids = TSDF_weights != 0. - TSDF_wsum = (all_TSDF_weights * all_TSDF_scores).sum(dim=0) - TSDF_wsum[valids] /= TSDF_weights[valids] - return TSDF_wsum, valids - - def _get_pixel_depths(self, image_coords, TSDF_filtering_thresh=None, with_normals_conf=False): - """ Recover depth value for each input pixel coordinate, along with OOB validity mask - """ - B, N, two = image_coords.shape - assert B == self.optimizer.n_imgs and two == 2 - depths = torch.zeros([B, N], device=image_coords.device) - valids = torch.zeros([B, N], dtype=bool, device=image_coords.device) - confs = torch.zeros([B, N], device=image_coords.device) - curconfs = self._get_confs_with_normals() if with_normals_conf else self.im_conf - for ni, (imc, depth, conf) in enumerate(zip(image_coords, self._get_depthmaps(TSDF_filtering_thresh), curconfs)): - H, W = depth.shape - valids[ni] = torch.logical_and(0 <= imc[:, 1], imc[:, 1] < - H) & torch.logical_and(0 <= imc[:, 0], imc[:, 0] < W) - imc[~valids[ni]] = 0 - depths[ni] = depth[imc[:, 1], imc[:, 0]] - confs[ni] = conf.cuda()[imc[:, 1], imc[:, 0]] - return depths, confs, valids - - def _get_confs_with_normals(self): - outconfs = [] - # Confidence basedf on depth gradient - - class Sobel(nn.Module): - def __init__(self): - super().__init__() - self.filter = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=1, bias=False) - Gx = torch.tensor([[2.0, 0.0, -2.0], [4.0, 0.0, -4.0], [2.0, 0.0, -2.0]]) - Gy = torch.tensor([[2.0, 4.0, 2.0], [0.0, 0.0, 0.0], [-2.0, -4.0, -2.0]]) - G = torch.cat([Gx.unsqueeze(0), Gy.unsqueeze(0)], 0) - G = G.unsqueeze(1) - self.filter.weight = nn.Parameter(G, requires_grad=False) - - def forward(self, img): - x = self.filter(img) - x = torch.mul(x, x) - x = torch.sum(x, dim=1, keepdim=True) - x = torch.sqrt(x) - return x - - grad_op = Sobel().to(self.im_depthmaps[0].device) - for conf, depth in zip(self.im_conf, self.im_depthmaps): - grad_confs = (1. - grad_op(depth[None, None])[0, 0]).clip(0) - if not 'dbg show': - pl.imshow(grad_confs.cpu()) - pl.show() - outconfs.append(conf * grad_confs.to(conf)) - return outconfs - - def _proj_pts3d(self, pts3d, cam2worlds, focals, pps): - """ - Projection operation: from 3D points to 2D coordinates + depths - """ - B = pts3d.shape[0] - assert pts3d.shape[0] == cam2worlds.shape[0] - # prepare Extrinsincs - R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1] - Rinv = R.transpose(-2, -1) - tinv = -Rinv @ t[..., None] - - # prepare intrinsics - intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(focals.shape[0], 1, 1) - if len(focals.shape) == 1: - focals = torch.stack([focals, focals], dim=-1) - intrinsics[:, 0, 0] = focals[:, 0] - intrinsics[:, 1, 1] = focals[:, 1] - intrinsics[:, :2, -1] = pps - # Project - projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N] - projpts = projpts.transpose(-2, -1) # [B,N,3] - projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z) - return projpts - - def _backproj_pts3d(self, in_depths=None, in_im_poses=None, - in_focals=None, in_pps=None, in_imshapes=None): - """ - Backprojection operation: from image depths to 3D points - """ - # Get depths and projection params if not provided - focals = self.optimizer.get_focals() if in_focals is None else in_focals - im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses - depth = self._get_depthmaps() if in_depths is None else in_depths - pp = self.optimizer.get_principal_points() if in_pps is None else in_pps - imshapes = self.imshapes if in_imshapes is None else in_imshapes - def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i]) - dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[[i]]) for i in range(im_poses.shape[0])] - - def autoprocess(x): - x = x[0] - return x.transpose(-2, -1) if len(x.shape) == 4 else x - return [geotrf(pose, autoprocess(pt)) for pose, pt in zip(im_poses, dm_to_3d)] - - def _pts3d_to_depth(self, pts3d, cam2worlds, focals, pps): - """ - Projection operation: from 3D points to 2D coordinates + depths - """ - B = pts3d.shape[0] - assert pts3d.shape[0] == cam2worlds.shape[0] - # prepare Extrinsincs - R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1] - Rinv = R.transpose(-2, -1) - tinv = -Rinv @ t[..., None] - - # prepare intrinsics - intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(self.optimizer.n_imgs, 1, 1) - if len(focals.shape) == 1: - focals = torch.stack([focals, focals], dim=-1) - intrinsics[:, 0, 0] = focals[:, 0] - intrinsics[:, 1, 1] = focals[:, 1] - intrinsics[:, :2, -1] = pps - # Project - projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N] - projpts = projpts.transpose(-2, -1) # [B,N,3] - projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z) - return projpts - - def _depth_to_pts3d(self, in_depths=None, in_im_poses=None, in_focals=None, in_pps=None, in_imshapes=None): - """ - Backprojection operation: from image depths to 3D points - """ - # Get depths and projection params if not provided - focals = self.optimizer.get_focals() if in_focals is None else in_focals - im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses - depth = self._get_depthmaps() if in_depths is None else in_depths - pp = self.optimizer.get_principal_points() if in_pps is None else in_pps - imshapes = self.imshapes if in_imshapes is None else in_imshapes - - def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i]) - - dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i + 1]) for i in range(im_poses.shape[0])] - - def autoprocess(x): - x = x[0] - H, W, three = x.shape[:3] - return x.transpose(-2, -1) if len(x.shape) == 4 else x - return [geotrf(pp, autoprocess(pt)) for pp, pt in zip(im_poses, dm_to_3d)] - - def _get_pts3d(self, TSDF_filtering_thresh=None, **kw): - """ - return 3D points (possibly filtering depths with TSDF) - """ - return self._backproj_pts3d(in_depths=self._get_depthmaps(TSDF_filtering_thresh=TSDF_filtering_thresh), **kw) - - def _TSDF_postprocess_or_not(self, pts3d, depthmaps, confs, niter=1): - # Setup inner variables - self.imshapes = [im.shape[:2] for im in self.optimizer.imgs] - self.im_depthmaps = [dd.log().view(imshape) for dd, imshape in zip(depthmaps, self.imshapes)] - self.im_conf = confs - - if self.TSDF_thresh > 0.: - # Create or update self.TSDF_im_depthmaps that contain logdepths filtered with TSDF - self._refine_depths_with_TSDF(self.TSDF_thresh, niter=niter) - depthmaps = [dd.exp() for dd in self.TSDF_im_depthmaps] - # Turn them into 3D points - pts3d = self._backproj_pts3d(in_depths=depthmaps) - depthmaps = [dd.flatten() for dd in depthmaps] - pts3d = [pp.view(-1, 3) for pp in pts3d] - return pts3d, depthmaps - - def get_dense_pts3d(self, clean_depth=True): - if clean_depth: - confs = clean_pointcloud(self.confs, self.optimizer.intrinsics, inv(self.optimizer.cam2w), - self.depthmaps, self.pts3d) - return self.pts3d, self.depthmaps, confs diff --git a/modules/mast3r/cloud_opt/utils/__init__.py b/modules/mast3r/cloud_opt/utils/__init__.py deleted file mode 100644 index d7dd877d649ce4dbd749dd7195a8b34c0f91d4f0..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). \ No newline at end of file diff --git a/modules/mast3r/cloud_opt/utils/__pycache__/__init__.cpython-312.pyc b/modules/mast3r/cloud_opt/utils/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index b0ea4751e1ca4170186f19709b1df61b129371b8..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/utils/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/utils/__pycache__/losses.cpython-312.pyc b/modules/mast3r/cloud_opt/utils/__pycache__/losses.cpython-312.pyc deleted file mode 100644 index 0bca3830b488842cec734b59d5eef5862c3457a2..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/utils/__pycache__/losses.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/utils/__pycache__/schedules.cpython-312.pyc b/modules/mast3r/cloud_opt/utils/__pycache__/schedules.cpython-312.pyc deleted file mode 100644 index dde1f492887f2380b8362947434eb4a7a822bc81..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/cloud_opt/utils/__pycache__/schedules.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/cloud_opt/utils/losses.py b/modules/mast3r/cloud_opt/utils/losses.py deleted file mode 100644 index e1dd36afd6862592b8d00c499988136a972bd6e6..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/utils/losses.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# losses for sparse ga -# -------------------------------------------------------- -import torch -import numpy as np - - -def l05_loss(x, y): - return torch.linalg.norm(x - y, dim=-1).sqrt() - - -def l1_loss(x, y): - return torch.linalg.norm(x - y, dim=-1) - - -def gamma_loss(gamma, mul=1, offset=None, clip=np.inf): - if offset is None: - if gamma == 1: - return l1_loss - # d(x**p)/dx = 1 ==> p * x**(p-1) == 1 ==> x = (1/p)**(1/(p-1)) - offset = (1 / gamma)**(1 / (gamma - 1)) - - def loss_func(x, y): - return (mul * l1_loss(x, y).clip(max=clip) + offset) ** gamma - offset ** gamma - return loss_func - - -def meta_gamma_loss(): - return lambda alpha: gamma_loss(alpha) diff --git a/modules/mast3r/cloud_opt/utils/schedules.py b/modules/mast3r/cloud_opt/utils/schedules.py deleted file mode 100644 index d96253b4348d2f089c10142c5991e5afb8a9b683..0000000000000000000000000000000000000000 --- a/modules/mast3r/cloud_opt/utils/schedules.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# lr schedules for sparse ga -# -------------------------------------------------------- -import numpy as np - - -def linear_schedule(alpha, lr_base, lr_end=0): - lr = (1 - alpha) * lr_base + alpha * lr_end - return lr - - -def cosine_schedule(alpha, lr_base, lr_end=0): - lr = lr_end + (lr_base - lr_end) * (1 + np.cos(alpha * np.pi)) / 2 - return lr diff --git a/modules/mast3r/colmap/__init__.py b/modules/mast3r/colmap/__init__.py deleted file mode 100644 index d7dd877d649ce4dbd749dd7195a8b34c0f91d4f0..0000000000000000000000000000000000000000 --- a/modules/mast3r/colmap/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). \ No newline at end of file diff --git a/modules/mast3r/colmap/database.py b/modules/mast3r/colmap/database.py deleted file mode 100644 index 5de83a35664d4038a99713de7f397e83940e5421..0000000000000000000000000000000000000000 --- a/modules/mast3r/colmap/database.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# MASt3R to colmap export functions -# -------------------------------------------------------- -import os -import torch -import copy -import numpy as np -import torchvision -import numpy as np -from tqdm import tqdm -from scipy.cluster.hierarchy import DisjointSet -from scipy.spatial.transform import Rotation as R - -from mast3r.utils.misc import hash_md5 - -from mast3r.fast_nn import extract_correspondences_nonsym, bruteforce_reciprocal_nns - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.utils.geometry import find_reciprocal_matches, xy_grid, geotrf # noqa - - -def convert_im_matches_pairs(img0, img1, image_to_colmap, im_keypoints, matches_im0, matches_im1, viz): - if viz: - from matplotlib import pyplot as pl - - image_mean = torch.as_tensor( - [0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1) - image_std = torch.as_tensor( - [0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1) - rgb0 = img0['img'] * image_std + image_mean - rgb0 = torchvision.transforms.functional.to_pil_image(rgb0[0]) - rgb0 = np.array(rgb0) - - rgb1 = img1['img'] * image_std + image_mean - rgb1 = torchvision.transforms.functional.to_pil_image(rgb1[0]) - rgb1 = np.array(rgb1) - - imgs = [rgb0, rgb1] - # visualize a few matches - n_viz = 100 - num_matches = matches_im0.shape[0] - match_idx_to_viz = np.round(np.linspace( - 0, num_matches - 1, n_viz)).astype(int) - viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz] - - H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2] - rgb0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)), - (0, 0), (0, 0)), 'constant', constant_values=0) - rgb1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)), - (0, 0), (0, 0)), 'constant', constant_values=0) - img = np.concatenate((rgb0, rgb1), axis=1) - pl.figure() - pl.imshow(img) - cmap = pl.get_cmap('jet') - for ii in range(n_viz): - (x0, y0), (x1, - y1) = viz_matches_im0[ii].T, viz_matches_im1[ii].T - pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(ii / - (n_viz - 1)), scalex=False, scaley=False) - pl.show(block=True) - - matches = [matches_im0.astype(np.float64), matches_im1.astype(np.float64)] - imgs = [img0, img1] - imidx0 = img0['idx'] - imidx1 = img1['idx'] - ravel_matches = [] - for j in range(2): - H, W = imgs[j]['true_shape'][0] - with np.errstate(invalid='ignore'): - qx, qy = matches[j].round().astype(np.int32).T - ravel_matches_j = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy) - ravel_matches.append(ravel_matches_j) - imidxj = imgs[j]['idx'] - for m in ravel_matches_j: - if m not in im_keypoints[imidxj]: - im_keypoints[imidxj][m] = 0 - im_keypoints[imidxj][m] += 1 - imid0 = copy.deepcopy(image_to_colmap[imidx0]['colmap_imid']) - imid1 = copy.deepcopy(image_to_colmap[imidx1]['colmap_imid']) - if imid0 > imid1: - colmap_matches = np.stack([ravel_matches[1], ravel_matches[0]], axis=-1) - imid0, imid1 = imid1, imid0 - imidx0, imidx1 = imidx1, imidx0 - else: - colmap_matches = np.stack([ravel_matches[0], ravel_matches[1]], axis=-1) - colmap_matches = np.unique(colmap_matches, axis=0) - return imidx0, imidx1, colmap_matches - - -def get_im_matches(pred1, pred2, pairs, image_to_colmap, im_keypoints, conf_thr, - is_sparse=True, subsample=8, pixel_tol=0, viz=False, device='cuda'): - im_matches = {} - for i in range(len(pred1['pts3d'])): - imidx0 = pairs[i][0]['idx'] - imidx1 = pairs[i][1]['idx'] - if 'desc' in pred1: # mast3r - descs = [pred1['desc'][i], pred2['desc'][i]] - confidences = [pred1['desc_conf'][i], pred2['desc_conf'][i]] - desc_dim = descs[0].shape[-1] - - if is_sparse: - corres = extract_correspondences_nonsym(descs[0], descs[1], confidences[0], confidences[1], - device=device, subsample=subsample, pixel_tol=pixel_tol) - conf = corres[2] - mask = conf >= conf_thr - matches_im0 = corres[0][mask].cpu().numpy() - matches_im1 = corres[1][mask].cpu().numpy() - else: - confidence_masks = [confidences[0] >= - conf_thr, confidences[1] >= conf_thr] - pts2d_list, desc_list = [], [] - for j in range(2): - conf_j = confidence_masks[j].cpu().numpy().flatten() - true_shape_j = pairs[i][j]['true_shape'][0] - pts2d_j = xy_grid( - true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j] - desc_j = descs[j].detach().cpu( - ).numpy().reshape(-1, desc_dim)[conf_j] - pts2d_list.append(pts2d_j) - desc_list.append(desc_j) - if len(desc_list[0]) == 0 or len(desc_list[1]) == 0: - continue - - nn0, nn1 = bruteforce_reciprocal_nns(desc_list[0], desc_list[1], - device=device, dist='dot', block_size=2**13) - reciprocal_in_P0 = (nn1[nn0] == np.arange(len(nn0))) - - matches_im1 = pts2d_list[1][nn0][reciprocal_in_P0] - matches_im0 = pts2d_list[0][reciprocal_in_P0] - else: - pts3d = [pred1['pts3d'][i], pred2['pts3d_in_other_view'][i]] - confidences = [pred1['conf'][i], pred2['conf'][i]] - - if is_sparse: - corres = extract_correspondences_nonsym(pts3d[0], pts3d[1], confidences[0], confidences[1], - device=device, subsample=subsample, pixel_tol=pixel_tol, - ptmap_key='3d') - conf = corres[2] - mask = conf >= conf_thr - matches_im0 = corres[0][mask].cpu().numpy() - matches_im1 = corres[1][mask].cpu().numpy() - else: - confidence_masks = [confidences[0] >= - conf_thr, confidences[1] >= conf_thr] - # find 2D-2D matches between the two images - pts2d_list, pts3d_list = [], [] - for j in range(2): - conf_j = confidence_masks[j].cpu().numpy().flatten() - true_shape_j = pairs[i][j]['true_shape'][0] - pts2d_j = xy_grid(true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j] - pts3d_j = pts3d[j].detach().cpu().numpy().reshape(-1, 3)[conf_j] - pts2d_list.append(pts2d_j) - pts3d_list.append(pts3d_j) - - PQ, PM = pts3d_list[0], pts3d_list[1] - if len(PQ) == 0 or len(PM) == 0: - continue - reciprocal_in_PM, nnM_in_PQ, num_matches = find_reciprocal_matches( - PQ, PM) - - matches_im1 = pts2d_list[1][reciprocal_in_PM] - matches_im0 = pts2d_list[0][nnM_in_PQ][reciprocal_in_PM] - - if len(matches_im0) == 0: - continue - imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1], - image_to_colmap, im_keypoints, - matches_im0, matches_im1, viz) - im_matches[(imidx0, imidx1)] = colmap_matches - return im_matches - - -def get_im_matches_from_cache(pairs, cache_path, desc_conf, subsample, - image_to_colmap, im_keypoints, conf_thr, - viz=False, device='cuda'): - im_matches = {} - for i in range(len(pairs)): - imidx0 = pairs[i][0]['idx'] - imidx1 = pairs[i][1]['idx'] - - corres_idx1 = hash_md5(pairs[i][0]['instance']) - corres_idx2 = hash_md5(pairs[i][1]['instance']) - - path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx1}-{corres_idx2}.pth' - if os.path.isfile(path_corres): - score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device) - else: - path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx2}-{corres_idx1}.pth' - score, (xy2, xy1, confs) = torch.load(path_corres, map_location=device) - mask = confs >= conf_thr - matches_im0 = xy1[mask].cpu().numpy() - matches_im1 = xy2[mask].cpu().numpy() - - if len(matches_im0) == 0: - continue - imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1], - image_to_colmap, im_keypoints, - matches_im0, matches_im1, viz) - im_matches[(imidx0, imidx1)] = colmap_matches - return im_matches - - -def export_images(db, images, image_paths, focals, ga_world_to_cam, camera_model): - # add cameras/images to the db - # with the output of ga as prior - image_to_colmap = {} - im_keypoints = {} - for idx in range(len(image_paths)): - im_keypoints[idx] = {} - H, W = images[idx]["orig_shape"] - if focals is None: - focal_x = focal_y = 1.2 * max(W, H) - prior_focal_length = False - cx = W / 2.0 - cy = H / 2.0 - elif isinstance(focals[idx], np.ndarray) and len(focals[idx].shape) == 2: - # intrinsics - focal_x = focals[idx][0, 0] - focal_y = focals[idx][1, 1] - cx = focals[idx][0, 2] * images[idx]["to_orig"][0, 0] - cy = focals[idx][1, 2] * images[idx]["to_orig"][1, 1] - prior_focal_length = True - else: - focal_x = focal_y = float(focals[idx]) - prior_focal_length = True - cx = W / 2.0 - cy = H / 2.0 - focal_x = focal_x * images[idx]["to_orig"][0, 0] - focal_y = focal_y * images[idx]["to_orig"][1, 1] - - if camera_model == "SIMPLE_PINHOLE": - model_id = 0 - focal = (focal_x + focal_y) / 2.0 - params = np.asarray([focal, cx, cy], np.float64) - elif camera_model == "PINHOLE": - model_id = 1 - params = np.asarray([focal_x, focal_y, cx, cy], np.float64) - elif camera_model == "SIMPLE_RADIAL": - model_id = 2 - focal = (focal_x + focal_y) / 2.0 - params = np.asarray([focal, cx, cy, 0.0], np.float64) - elif camera_model == "OPENCV": - model_id = 4 - params = np.asarray([focal_x, focal_y, cx, cy, 0.0, 0.0, 0.0, 0.0], np.float64) - else: - raise ValueError(f"invalid camera model {camera_model}") - - H, W = int(H), int(W) - # OPENCV camera model - camid = db.add_camera( - model_id, W, H, params, prior_focal_length=prior_focal_length) - if ga_world_to_cam is None: - prior_t = np.zeros(3) - prior_q = np.zeros(4) - else: - q = R.from_matrix(ga_world_to_cam[idx][:3, :3]).as_quat() - prior_t = ga_world_to_cam[idx][:3, 3] - prior_q = np.array([q[-1], q[0], q[1], q[2]]) - imid = db.add_image( - image_paths[idx], camid, prior_q=prior_q, prior_t=prior_t) - image_to_colmap[idx] = { - 'colmap_imid': imid, - 'colmap_camid': camid - } - return image_to_colmap, im_keypoints - - -def export_matches(db, images, image_to_colmap, im_keypoints, im_matches, min_len_track, skip_geometric_verification): - colmap_image_pairs = [] - # 2D-2D are quite dense - # we want to remove the very small tracks - # and export only kpt for which we have values - # build tracks - print("building tracks") - keypoints_to_track_id = {} - track_id_to_kpt_list = [] - to_merge = [] - for (imidx0, imidx1), colmap_matches in tqdm(im_matches.items()): - if imidx0 not in keypoints_to_track_id: - keypoints_to_track_id[imidx0] = {} - if imidx1 not in keypoints_to_track_id: - keypoints_to_track_id[imidx1] = {} - - for m in colmap_matches: - if m[0] not in keypoints_to_track_id[imidx0] and m[1] not in keypoints_to_track_id[imidx1]: - # new pair of kpts never seen before - track_idx = len(track_id_to_kpt_list) - keypoints_to_track_id[imidx0][m[0]] = track_idx - keypoints_to_track_id[imidx1][m[1]] = track_idx - track_id_to_kpt_list.append( - [(imidx0, m[0]), (imidx1, m[1])]) - elif m[1] not in keypoints_to_track_id[imidx1]: - # 0 has a track, not 1 - track_idx = keypoints_to_track_id[imidx0][m[0]] - keypoints_to_track_id[imidx1][m[1]] = track_idx - track_id_to_kpt_list[track_idx].append((imidx1, m[1])) - elif m[0] not in keypoints_to_track_id[imidx0]: - # 1 has a track, not 0 - track_idx = keypoints_to_track_id[imidx1][m[1]] - keypoints_to_track_id[imidx0][m[0]] = track_idx - track_id_to_kpt_list[track_idx].append((imidx0, m[0])) - else: - # both have tracks, merge them - track_idx0 = keypoints_to_track_id[imidx0][m[0]] - track_idx1 = keypoints_to_track_id[imidx1][m[1]] - if track_idx0 != track_idx1: - # let's deal with them later - to_merge.append((track_idx0, track_idx1)) - - # regroup merge targets - print("merging tracks") - unique = np.unique(to_merge) - tree = DisjointSet(unique) - for track_idx0, track_idx1 in tqdm(to_merge): - tree.merge(track_idx0, track_idx1) - - subsets = tree.subsets() - print("applying merge") - for setvals in tqdm(subsets): - new_trackid = len(track_id_to_kpt_list) - kpt_list = [] - for track_idx in setvals: - kpt_list.extend(track_id_to_kpt_list[track_idx]) - for imidx, kpid in track_id_to_kpt_list[track_idx]: - keypoints_to_track_id[imidx][kpid] = new_trackid - track_id_to_kpt_list.append(kpt_list) - - # binc = np.bincount([len(v) for v in track_id_to_kpt_list]) - # nonzero = np.nonzero(binc) - # nonzerobinc = binc[nonzero[0]] - # print(nonzero[0].tolist()) - # print(nonzerobinc) - num_valid_tracks = sum( - [1 for v in track_id_to_kpt_list if len(v) >= min_len_track]) - - keypoints_to_idx = {} - print(f"squashing keypoints - {num_valid_tracks} valid tracks") - for imidx, keypoints_imid in tqdm(im_keypoints.items()): - imid = image_to_colmap[imidx]['colmap_imid'] - keypoints_kept = [] - keypoints_to_idx[imidx] = {} - for kp in keypoints_imid.keys(): - if kp not in keypoints_to_track_id[imidx]: - continue - track_idx = keypoints_to_track_id[imidx][kp] - track_length = len(track_id_to_kpt_list[track_idx]) - if track_length < min_len_track: - continue - keypoints_to_idx[imidx][kp] = len(keypoints_kept) - keypoints_kept.append(kp) - if len(keypoints_kept) == 0: - continue - keypoints_kept = np.array(keypoints_kept) - keypoints_kept = np.unravel_index(keypoints_kept, images[imidx]['true_shape'][0])[ - 0].base[:, ::-1].copy().astype(np.float32) - # rescale coordinates - keypoints_kept[:, 0] += 0.5 - keypoints_kept[:, 1] += 0.5 - keypoints_kept = geotrf(images[imidx]['to_orig'], keypoints_kept, norm=True) - - H, W = images[imidx]['orig_shape'] - keypoints_kept[:, 0] = keypoints_kept[:, 0].clip(min=0, max=W - 0.01) - keypoints_kept[:, 1] = keypoints_kept[:, 1].clip(min=0, max=H - 0.01) - - db.add_keypoints(imid, keypoints_kept) - - print("exporting im_matches") - for (imidx0, imidx1), colmap_matches in im_matches.items(): - imid0, imid1 = image_to_colmap[imidx0]['colmap_imid'], image_to_colmap[imidx1]['colmap_imid'] - assert imid0 < imid1 - final_matches = np.array([[keypoints_to_idx[imidx0][m[0]], keypoints_to_idx[imidx1][m[1]]] - for m in colmap_matches - if m[0] in keypoints_to_idx[imidx0] and m[1] in keypoints_to_idx[imidx1]]) - if len(final_matches) > 0: - colmap_image_pairs.append( - (images[imidx0]['instance'], images[imidx1]['instance'])) - db.add_matches(imid0, imid1, final_matches) - if skip_geometric_verification: - db.add_two_view_geometry(imid0, imid1, final_matches) - return colmap_image_pairs diff --git a/modules/mast3r/datasets/__init__.py b/modules/mast3r/datasets/__init__.py deleted file mode 100644 index c625aca0a773c105ed229ff87364721b4755bc8d..0000000000000000000000000000000000000000 --- a/modules/mast3r/datasets/__init__.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). - -from .base.mast3r_base_stereo_view_dataset import MASt3RBaseStereoViewDataset - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.datasets.arkitscenes import ARKitScenes as DUSt3R_ARKitScenes # noqa -from dust3r.datasets.blendedmvs import BlendedMVS as DUSt3R_BlendedMVS # noqa -from dust3r.datasets.co3d import Co3d as DUSt3R_Co3d # noqa -from dust3r.datasets.megadepth import MegaDepth as DUSt3R_MegaDepth # noqa -from dust3r.datasets.scannetpp import ScanNetpp as DUSt3R_ScanNetpp # noqa -from dust3r.datasets.staticthings3d import StaticThings3D as DUSt3R_StaticThings3D # noqa -from dust3r.datasets.waymo import Waymo as DUSt3R_Waymo # noqa -from dust3r.datasets.wildrgbd import WildRGBD as DUSt3R_WildRGBD # noqa - - -class ARKitScenes(DUSt3R_ARKitScenes, MASt3RBaseStereoViewDataset): - def __init__(self, *args, split, ROOT, **kwargs): - super().__init__(*args, split=split, ROOT=ROOT, **kwargs) - self.is_metric_scale = True - - -class BlendedMVS(DUSt3R_BlendedMVS, MASt3RBaseStereoViewDataset): - def __init__(self, *args, ROOT, split=None, **kwargs): - super().__init__(*args, ROOT=ROOT, split=split, **kwargs) - self.is_metric_scale = False - - -class Co3d(DUSt3R_Co3d, MASt3RBaseStereoViewDataset): - def __init__(self, mask_bg=True, *args, ROOT, **kwargs): - super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs) - self.is_metric_scale = False - - -class MegaDepth(DUSt3R_MegaDepth, MASt3RBaseStereoViewDataset): - def __init__(self, *args, split, ROOT, **kwargs): - super().__init__(*args, split=split, ROOT=ROOT, **kwargs) - self.is_metric_scale = False - - -class ScanNetpp(DUSt3R_ScanNetpp, MASt3RBaseStereoViewDataset): - def __init__(self, *args, ROOT, **kwargs): - super().__init__(*args, ROOT=ROOT, **kwargs) - self.is_metric_scale = True - - -class StaticThings3D(DUSt3R_StaticThings3D, MASt3RBaseStereoViewDataset): - def __init__(self, ROOT, *args, mask_bg='rand', **kwargs): - super().__init__(ROOT, *args, mask_bg=mask_bg, **kwargs) - self.is_metric_scale = False - - -class Waymo(DUSt3R_Waymo, MASt3RBaseStereoViewDataset): - def __init__(self, *args, ROOT, **kwargs): - super().__init__(*args, ROOT=ROOT, **kwargs) - self.is_metric_scale = True - - -class WildRGBD(DUSt3R_WildRGBD, MASt3RBaseStereoViewDataset): - def __init__(self, mask_bg=True, *args, ROOT, **kwargs): - super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs) - self.is_metric_scale = True diff --git a/modules/mast3r/datasets/base/__init__.py b/modules/mast3r/datasets/base/__init__.py deleted file mode 100644 index d7dd877d649ce4dbd749dd7195a8b34c0f91d4f0..0000000000000000000000000000000000000000 --- a/modules/mast3r/datasets/base/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). \ No newline at end of file diff --git a/modules/mast3r/datasets/base/mast3r_base_stereo_view_dataset.py b/modules/mast3r/datasets/base/mast3r_base_stereo_view_dataset.py deleted file mode 100644 index 3ced0ef0dc6b1d6225781af55d3e924e133fdeaf..0000000000000000000000000000000000000000 --- a/modules/mast3r/datasets/base/mast3r_base_stereo_view_dataset.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# base class for implementing datasets -# -------------------------------------------------------- -import PIL.Image -import PIL.Image as Image -import numpy as np -import torch -import copy - -from mast3r.datasets.utils.cropping import (extract_correspondences_from_pts3d, - gen_random_crops, in2d_rect, crop_to_homography) - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset, view_name, is_good_type # noqa -from dust3r.datasets.utils.transforms import ImgNorm -from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates, geotrf, depthmap_to_camera_coordinates -import dust3r.datasets.utils.cropping as cropping - - -class MASt3RBaseStereoViewDataset(BaseStereoViewDataset): - def __init__(self, *, # only keyword arguments - split=None, - resolution=None, # square_size or (width, height) or list of [(width,height), ...] - transform=ImgNorm, - aug_crop=False, - aug_swap=False, - aug_monocular=False, - aug_portrait_or_landscape=True, # automatic choice between landscape/portrait when possible - aug_rot90=False, - n_corres=0, - nneg=0, - n_tentative_crops=4, - seed=None): - super().__init__(split=split, resolution=resolution, transform=transform, aug_crop=aug_crop, seed=seed) - self.is_metric_scale = False # by default a dataset is not metric scale, subclasses can overwrite this - - self.aug_swap = aug_swap - self.aug_monocular = aug_monocular - self.aug_portrait_or_landscape = aug_portrait_or_landscape - self.aug_rot90 = aug_rot90 - - self.n_corres = n_corres - self.nneg = nneg - assert self.n_corres == 'all' or isinstance(self.n_corres, int) or (isinstance(self.n_corres, list) and len( - self.n_corres) == self.num_views), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}" - assert self.nneg == 0 or self.n_corres != 'all' - self.n_tentative_crops = n_tentative_crops - - def _swap_view_aug(self, views): - if self._rng.random() < 0.5: - views.reverse() - - def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None): - """ This function: - - first downsizes the image with LANCZOS inteprolation, - which is better than bilinear interpolation in - """ - if not isinstance(image, PIL.Image.Image): - image = PIL.Image.fromarray(image) - - # transpose the resolution if necessary - W, H = image.size # new size - assert resolution[0] >= resolution[1] - if H > 1.1 * W: - # image is portrait mode - resolution = resolution[::-1] - elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]: - # image is square, so we chose (portrait, landscape) randomly - if rng.integers(2) and self.aug_portrait_or_landscape: - resolution = resolution[::-1] - - # high-quality Lanczos down-scaling - target_resolution = np.array(resolution) - image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution) - - # actual cropping (if necessary) with bilinear interpolation - offset_factor = 0.5 - intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=offset_factor) - crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution) - image, depthmap, intrinsics2 = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox) - - return image, depthmap, intrinsics2 - - def generate_crops_from_pair(self, view1, view2, resolution, aug_crop_arg, n_crops=4, rng=np.random): - views = [view1, view2] - - if aug_crop_arg is False: - # compatibility - for i in range(2): - view = views[i] - view['img'], view['depthmap'], view['camera_intrinsics'] = self._crop_resize_if_necessary(view['img'], - view['depthmap'], - view['camera_intrinsics'], - resolution, - rng=rng) - view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'], - view['camera_intrinsics'], - view['camera_pose']) - return - - # extract correspondences - corres = extract_correspondences_from_pts3d(*views, target_n_corres=None, rng=rng) - - # generate 4 random crops in each view - view_crops = [] - crops_resolution = [] - corres_msks = [] - for i in range(2): - - if aug_crop_arg == 'auto': - S = min(views[i]['img'].size) - R = min(resolution) - aug_crop = S * (S - R) // R - aug_crop = max(.1 * S, aug_crop) # for cropping: augment scale of at least 10%, and more if possible - else: - aug_crop = aug_crop_arg - - # tranpose the target resolution if necessary - assert resolution[0] >= resolution[1] - W, H = imsize = views[i]['img'].size - crop_resolution = resolution - if H > 1.1 * W: - # image is portrait mode - crop_resolution = resolution[::-1] - elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]: - # image is square, so we chose (portrait, landscape) randomly - if rng.integers(2): - crop_resolution = resolution[::-1] - - crops = gen_random_crops(imsize, n_crops, crop_resolution, aug_crop=aug_crop, rng=rng) - view_crops.append(crops) - crops_resolution.append(crop_resolution) - - # compute correspondences - corres_msks.append(in2d_rect(corres[i], crops)) - - # compute IoU for each - intersection = np.float32(corres_msks[0]).T @ np.float32(corres_msks[1]) - # select best pair of crops - best = np.unravel_index(intersection.argmax(), (n_crops, n_crops)) - crops = [view_crops[i][c] for i, c in enumerate(best)] - - # crop with the homography - for i in range(2): - view = views[i] - imsize, K_new, R, H = crop_to_homography(view['camera_intrinsics'], crops[i], crops_resolution[i]) - # imsize, K_new, H = upscale_homography(imsize, resolution, K_new, H) - - # update camera params - K_old = view['camera_intrinsics'] - view['camera_intrinsics'] = K_new - view['camera_pose'] = view['camera_pose'].copy() - view['camera_pose'][:3, :3] = view['camera_pose'][:3, :3] @ R - - # apply homography to image and depthmap - homo8 = (H / H[2, 2]).ravel().tolist()[:8] - view['img'] = view['img'].transform(imsize, Image.Transform.PERSPECTIVE, - homo8, - resample=Image.Resampling.BICUBIC) - - depthmap2 = depthmap_to_camera_coordinates(view['depthmap'], K_old)[0] @ R[:, 2] - view['depthmap'] = np.array(Image.fromarray(depthmap2).transform( - imsize, Image.Transform.PERSPECTIVE, homo8)) - - if 'track_labels' in view: - # convert from uint64 --> uint32, because PIL.Image cannot handle uint64 - mapping, track_labels = np.unique(view['track_labels'], return_inverse=True) - track_labels = track_labels.astype(np.uint32).reshape(view['track_labels'].shape) - - # homography transformation - res = np.array(Image.fromarray(track_labels).transform(imsize, Image.Transform.PERSPECTIVE, homo8)) - view['track_labels'] = mapping[res] # mapping back to uint64 - - # recompute 3d points from scratch - view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'], - view['camera_intrinsics'], - view['camera_pose']) - - def __getitem__(self, idx): - if isinstance(idx, tuple): - # the idx is specifying the aspect-ratio - idx, ar_idx = idx - else: - assert len(self._resolutions) == 1 - ar_idx = 0 - - # set-up the rng - if self.seed: # reseed for each __getitem__ - self._rng = np.random.default_rng(seed=self.seed + idx) - elif not hasattr(self, '_rng'): - seed = torch.initial_seed() # this is different for each dataloader process - self._rng = np.random.default_rng(seed=seed) - - # over-loaded code - resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler) - views = self._get_views(idx, resolution, self._rng) - assert len(views) == self.num_views - - for v, view in enumerate(views): - assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}" - view['idx'] = (idx, ar_idx, v) - view['is_metric_scale'] = self.is_metric_scale - - assert 'camera_intrinsics' in view - if 'camera_pose' not in view: - view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32) - else: - assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}' - assert 'pts3d' not in view - assert 'valid_mask' not in view - assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}' - - pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view) - - view['pts3d'] = pts3d - view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1) - - self.generate_crops_from_pair(views[0], views[1], resolution=resolution, - aug_crop_arg=self.aug_crop, - n_crops=self.n_tentative_crops, - rng=self._rng) - for v, view in enumerate(views): - # encode the image - width, height = view['img'].size - view['true_shape'] = np.int32((height, width)) - view['img'] = self.transform(view['img']) - # Pixels for which depth is fundamentally undefined - view['sky_mask'] = (view['depthmap'] < 0) - - if self.aug_swap: - self._swap_view_aug(views) - - if self.aug_monocular: - if self._rng.random() < self.aug_monocular: - views = [copy.deepcopy(views[0]) for _ in range(len(views))] - - # automatic extraction of correspondences from pts3d + pose - if self.n_corres > 0 and ('corres' not in view): - corres1, corres2, valid = extract_correspondences_from_pts3d(*views, self.n_corres, - self._rng, nneg=self.nneg) - views[0]['corres'] = corres1 - views[1]['corres'] = corres2 - views[0]['valid_corres'] = valid - views[1]['valid_corres'] = valid - - if self.aug_rot90 is False: - pass - elif self.aug_rot90 == 'same': - rotate_90(views, k=self._rng.choice(4)) - elif self.aug_rot90 == 'diff': - rotate_90(views[:1], k=self._rng.choice(4)) - rotate_90(views[1:], k=self._rng.choice(4)) - else: - raise ValueError(f'Bad value for {self.aug_rot90=}') - - # check data-types metric_scale - for v, view in enumerate(views): - if 'corres' not in view: - view['corres'] = np.full((self.n_corres, 2), np.nan, dtype=np.float32) - - # check all datatypes - for key, val in view.items(): - res, err_msg = is_good_type(key, val) - assert res, f"{err_msg} with {key}={val} for view {view_name(view)}" - K = view['camera_intrinsics'] - - # check shapes - assert view['depthmap'].shape == view['img'].shape[1:] - assert view['depthmap'].shape == view['pts3d'].shape[:2] - assert view['depthmap'].shape == view['valid_mask'].shape - - # last thing done! - for view in views: - # transpose to make sure all views are the same size - transpose_to_landscape(view) - # this allows to check whether the RNG is is the same state each time - view['rng'] = int.from_bytes(self._rng.bytes(4), 'big') - - return views - - -def transpose_to_landscape(view, revert=False): - height, width = view['true_shape'] - - if width < height: - if revert: - height, width = width, height - - # rectify portrait to landscape - assert view['img'].shape == (3, height, width) - view['img'] = view['img'].swapaxes(1, 2) - - assert view['valid_mask'].shape == (height, width) - view['valid_mask'] = view['valid_mask'].swapaxes(0, 1) - - assert view['sky_mask'].shape == (height, width) - view['sky_mask'] = view['sky_mask'].swapaxes(0, 1) - - assert view['depthmap'].shape == (height, width) - view['depthmap'] = view['depthmap'].swapaxes(0, 1) - - assert view['pts3d'].shape == (height, width, 3) - view['pts3d'] = view['pts3d'].swapaxes(0, 1) - - # transpose x and y pixels - view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]] - - # transpose correspondences x and y - view['corres'] = view['corres'][:, [1, 0]] - - -def rotate_90(views, k=1): - from scipy.spatial.transform import Rotation - # print('rotation =', k) - - RT = np.eye(4, dtype=np.float32) - RT[:3, :3] = Rotation.from_euler('z', 90 * k, degrees=True).as_matrix() - - for view in views: - view['img'] = torch.rot90(view['img'], k=k, dims=(-2, -1)) # WARNING!! dims=(-1,-2) != dims=(-2,-1) - view['depthmap'] = np.rot90(view['depthmap'], k=k).copy() - view['camera_pose'] = view['camera_pose'] @ RT - - RT2 = np.eye(3, dtype=np.float32) - RT2[:2, :2] = RT[:2, :2] * ((1, -1), (-1, 1)) - H, W = view['depthmap'].shape - if k % 4 == 0: - pass - elif k % 4 == 1: - # top-left (0,0) pixel becomes (0,H-1) - RT2[:2, 2] = (0, H - 1) - elif k % 4 == 2: - # top-left (0,0) pixel becomes (W-1,H-1) - RT2[:2, 2] = (W - 1, H - 1) - elif k % 4 == 3: - # top-left (0,0) pixel becomes (W-1,0) - RT2[:2, 2] = (W - 1, 0) - else: - raise ValueError(f'Bad value for {k=}') - - view['camera_intrinsics'][:2, 2] = geotrf(RT2, view['camera_intrinsics'][:2, 2]) - if k % 2 == 1: - K = view['camera_intrinsics'] - np.fill_diagonal(K, K.diagonal()[[1, 0, 2]]) - - pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view) - view['pts3d'] = pts3d - view['valid_mask'] = np.rot90(view['valid_mask'], k=k).copy() - view['sky_mask'] = np.rot90(view['sky_mask'], k=k).copy() - - view['corres'] = geotrf(RT2, view['corres']).round().astype(view['corres'].dtype) - view['true_shape'] = np.int32((H, W)) diff --git a/modules/mast3r/datasets/utils/__init__.py b/modules/mast3r/datasets/utils/__init__.py deleted file mode 100644 index a32692113d830ddc4af4e6ed608f222fbe062e6e..0000000000000000000000000000000000000000 --- a/modules/mast3r/datasets/utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/modules/mast3r/datasets/utils/cropping.py b/modules/mast3r/datasets/utils/cropping.py deleted file mode 100644 index 57f4d84b019eaac9cf0c308a94f2cb8e2ec1a6ba..0000000000000000000000000000000000000000 --- a/modules/mast3r/datasets/utils/cropping.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# cropping/match extraction -# -------------------------------------------------------- -import numpy as np -import mast3r.utils.path_to_dust3r # noqa -from dust3r.utils.device import to_numpy -from dust3r.utils.geometry import inv, geotrf - - -def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False): - is_reciprocal1 = (corres_2_to_1[corres_1_to_2] == np.arange(len(corres_1_to_2))) - pos1 = is_reciprocal1.nonzero()[0] - pos2 = corres_1_to_2[pos1] - if ret_recip: - return is_reciprocal1, pos1, pos2 - return pos1, pos2 - - -def extract_correspondences_from_pts3d(view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0): - view1, view2 = to_numpy((view1, view2)) - # project pixels from image1 --> 3d points --> image2 pixels - shape1, corres1_to_2 = reproject_view(view1['pts3d'], view2) - shape2, corres2_to_1 = reproject_view(view2['pts3d'], view1) - - # compute reciprocal correspondences: - # pos1 == valid pixels (correspondences) in image1 - is_reciprocal1, pos1, pos2 = reciprocal_1d(corres1_to_2, corres2_to_1, ret_recip=True) - is_reciprocal2 = (corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1))) - - if target_n_corres is None: - if ret_xy: - pos1 = unravel_xy(pos1, shape1) - pos2 = unravel_xy(pos2, shape2) - return pos1, pos2 - - available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum()) - target_n_positives = int(target_n_corres * (1 - nneg)) - n_positives = min(len(pos1), target_n_positives) - n_negatives = min(target_n_corres - n_positives, available_negatives) - - if n_negatives + n_positives != target_n_corres: - # should be really rare => when there are not enough negatives - # in that case, break nneg and add a few more positives ? - n_positives = target_n_corres - n_negatives - assert n_positives <= len(pos1) - - assert n_positives <= len(pos1) - assert n_positives <= len(pos2) - assert n_negatives <= (~is_reciprocal1).sum() - assert n_negatives <= (~is_reciprocal2).sum() - assert n_positives + n_negatives == target_n_corres - - valid = np.ones(n_positives, dtype=bool) - if n_positives < len(pos1): - # random sub-sampling of valid correspondences - perm = rng.permutation(len(pos1))[:n_positives] - pos1 = pos1[perm] - pos2 = pos2[perm] - - if n_negatives > 0: - # add false correspondences if not enough - def norm(p): return p / p.sum() - pos1 = np.r_[pos1, rng.choice(shape1[0] * shape1[1], size=n_negatives, replace=False, p=norm(~is_reciprocal1))] - pos2 = np.r_[pos2, rng.choice(shape2[0] * shape2[1], size=n_negatives, replace=False, p=norm(~is_reciprocal2))] - valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)] - - # convert (x+W*y) back to 2d (x,y) coordinates - if ret_xy: - pos1 = unravel_xy(pos1, shape1) - pos2 = unravel_xy(pos2, shape2) - return pos1, pos2, valid - - -def reproject_view(pts3d, view2): - shape = view2['pts3d'].shape[:2] - return reproject(pts3d, view2['camera_intrinsics'], inv(view2['camera_pose']), shape) - - -def reproject(pts3d, K, world2cam, shape): - H, W, THREE = pts3d.shape - assert THREE == 3 - - # reproject in camera2 space - with np.errstate(divide='ignore', invalid='ignore'): - pos = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2) - - # quantize to pixel positions - return (H, W), ravel_xy(pos, shape) - - -def ravel_xy(pos, shape): - H, W = shape - with np.errstate(invalid='ignore'): - qx, qy = pos.reshape(-1, 2).round().astype(np.int32).T - quantized_pos = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy) - return quantized_pos - - -def unravel_xy(pos, shape): - # convert (x+W*y) back to 2d (x,y) coordinates - return np.unravel_index(pos, shape)[0].base[:, ::-1].copy() - - -def _rotation_origin_to_pt(target): - """ Align the origin (0,0,1) with the target point (x,y,1) in projective space. - Method: rotate z to put target on (x'+,0,1), then rotate on Y to get (0,0,1) and un-rotate z. - """ - from scipy.spatial.transform import Rotation - x, y = target - rot_z = np.arctan2(y, x) - rot_y = np.arctan(np.linalg.norm(target)) - R = Rotation.from_euler('ZYZ', [rot_z, rot_y, -rot_z]).as_matrix() - return R - - -def _dotmv(Trf, pts, ncol=None, norm=False): - assert Trf.ndim >= 2 - ncol = ncol or pts.shape[-1] - - # adapt shape if necessary - output_reshape = pts.shape[:-1] - if Trf.ndim >= 3: - n = Trf.ndim - 2 - assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match' - Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1]) - - if pts.ndim > Trf.ndim: - # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d) - pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1]) - elif pts.ndim == 2: - # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d) - pts = pts[:, None, :] - - if pts.shape[-1] + 1 == Trf.shape[-1]: - Trf = Trf.swapaxes(-1, -2) # transpose Trf - pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :] - - elif pts.shape[-1] == Trf.shape[-1]: - Trf = Trf.swapaxes(-1, -2) # transpose Trf - pts = pts @ Trf - else: - pts = Trf @ pts.T - if pts.ndim >= 2: - pts = pts.swapaxes(-1, -2) - - if norm: - pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG - if norm != 1: - pts *= norm - - res = pts[..., :ncol].reshape(*output_reshape, ncol) - return res - - -def crop_to_homography(K, crop, target_size=None): - """ Given an image and its intrinsics, - we want to replicate a rectangular crop with an homography, - so that the principal point of the new 'crop' is centered. - """ - # build intrinsics for the crop - crop = np.round(crop) - crop_size = crop[2:] - crop[:2] - K2 = K.copy() # same focal - K2[:2, 2] = crop_size / 2 # new principal point is perfectly centered - - # find which corner is the most far-away from current principal point - # so that the final homography does not go over the image borders - corners = crop.reshape(-1, 2) - corner_idx = np.abs(corners - K[:2, 2]).argmax(0) - corner = corners[corner_idx, [0, 1]] - # align with the corresponding corner from the target view - corner2 = np.c_[[0, 0], crop_size][[0, 1], corner_idx] - - old_pt = _dotmv(np.linalg.inv(K), corner, norm=1) - new_pt = _dotmv(np.linalg.inv(K2), corner2, norm=1) - R = _rotation_origin_to_pt(old_pt) @ np.linalg.inv(_rotation_origin_to_pt(new_pt)) - - if target_size is not None: - imsize = target_size - target_size = np.asarray(target_size) - scaling = min(target_size / crop_size) - K2[:2] *= scaling - K2[:2, 2] = target_size / 2 - else: - imsize = tuple(np.int32(crop_size).tolist()) - - return imsize, K2, R, K @ R @ np.linalg.inv(K2) - - -def gen_random_crops(imsize, n_crops, resolution, aug_crop, rng=np.random): - """ Generate random crops of size=resolution, - for an input image upscaled to (imsize + randint(0 , aug_crop)) - """ - resolution_crop = np.array(resolution) * min(np.array(imsize) / resolution) - - # (virtually) upscale the input image - # scaling = rng.uniform(1, 1+(aug_crop+1)/min(imsize)) - scaling = np.exp(rng.uniform(0, np.log(1 + aug_crop / min(imsize)))) - imsize2 = np.int32(np.array(imsize) * scaling) - - # generate some random crops - topleft = rng.random((n_crops, 2)) * (imsize2 - resolution_crop) - crops = np.c_[topleft, topleft + resolution_crop] - # print(f"{scaling=}, {topleft=}") - # reduce the resolution to come back to original size - crops /= scaling - return crops - - -def in2d_rect(corres, crops): - # corres = (N,2) - # crops = (M,4) - # output = (N, M) - is_sup = (corres[:, None] >= crops[None, :, 0:2]) - is_inf = (corres[:, None] < crops[None, :, 2:4]) - return (is_sup & is_inf).all(axis=-1) diff --git a/modules/mast3r/demo.py b/modules/mast3r/demo.py deleted file mode 100644 index 22b6a66c24666776a7197844a0463d7821ed53ce..0000000000000000000000000000000000000000 --- a/modules/mast3r/demo.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# sparse gradio demo functions -# -------------------------------------------------------- -import math -import gradio -import os -import numpy as np -import functools -import trimesh -import copy -from scipy.spatial.transform import Rotation -import tempfile -import shutil - -from mast3r.cloud_opt.sparse_ga import sparse_global_alignment -from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.image_pairs import make_pairs -from dust3r.utils.image import load_images -from dust3r.utils.device import to_numpy -from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes -from dust3r.demo import get_args_parser as dust3r_get_args_parser - -import matplotlib.pyplot as pl - - -class SparseGAState(): - def __init__(self, sparse_ga, should_delete=False, cache_dir=None, outfile_name=None): - self.sparse_ga = sparse_ga - self.cache_dir = cache_dir - self.outfile_name = outfile_name - self.should_delete = should_delete - - def __del__(self): - if not self.should_delete: - return - if self.cache_dir is not None and os.path.isdir(self.cache_dir): - shutil.rmtree(self.cache_dir) - self.cache_dir = None - if self.outfile_name is not None and os.path.isfile(self.outfile_name): - os.remove(self.outfile_name) - self.outfile_name = None - - -def get_args_parser(): - parser = dust3r_get_args_parser() - parser.add_argument('--share', action='store_true') - parser.add_argument('--gradio_delete_cache', default=None, type=int, - help='age/frequency at which gradio removes the file. If >0, matching cache is purged') - - actions = parser._actions - for action in actions: - if action.dest == 'model_name': - action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"] - # change defaults - parser.prog = 'mast3r demo' - return parser - - -def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, - cam_color=None, as_pointcloud=False, - transparent_cams=False, silent=False): - assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) - pts3d = to_numpy(pts3d) - imgs = to_numpy(imgs) - focals = to_numpy(focals) - cams2world = to_numpy(cams2world) - - scene = trimesh.Scene() - - # full pointcloud - if as_pointcloud: - pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)]).reshape(-1, 3) - col = np.concatenate([p[m] for p, m in zip(imgs, mask)]).reshape(-1, 3) - valid_msk = np.isfinite(pts.sum(axis=1)) - pct = trimesh.PointCloud(pts[valid_msk], colors=col[valid_msk]) - scene.add_geometry(pct) - else: - meshes = [] - for i in range(len(imgs)): - pts3d_i = pts3d[i].reshape(imgs[i].shape) - msk_i = mask[i] & np.isfinite(pts3d_i.sum(axis=-1)) - meshes.append(pts3d_to_trimesh(imgs[i], pts3d_i, msk_i)) - mesh = trimesh.Trimesh(**cat_meshes(meshes)) - scene.add_geometry(mesh) - - # add each camera - for i, pose_c2w in enumerate(cams2world): - if isinstance(cam_color, list): - camera_edge_color = cam_color[i] - else: - camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] - add_scene_cam(scene, pose_c2w, camera_edge_color, - None if transparent_cams else imgs[i], focals[i], - imsize=imgs[i].shape[1::-1], screen_width=cam_size) - - rot = np.eye(4) - rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() - scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) - if not silent: - print('(exporting 3D scene to', outfile, ')') - scene.export(file_obj=outfile) - return outfile - - -def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False, - clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0): - """ - extract 3D_model (glb file) from a reconstructed scene - """ - if scene_state is None: - return None - outfile = scene_state.outfile_name - if outfile is None: - return None - - # get optimized values from scene - scene = scene_state.sparse_ga - rgbimg = scene.imgs - focals = scene.get_focals().cpu() - cams2world = scene.get_im_poses().cpu() - - # 3D pointcloud from depthmap, poses and intrinsics - if TSDF_thresh > 0: - tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh) - pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth)) - else: - pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth)) - msk = to_numpy([c > min_conf_thr for c in confs]) - return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, - transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) - - -def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent, image_size, current_scene_state, - filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr, - as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, scenegraph_type, winsize, - win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw): - """ - from a list of images, run mast3r inference, sparse global aligner. - then run get_3D_model_from_scene - """ - imgs = load_images(filelist, size=image_size, verbose=not silent) - if len(imgs) == 1: - imgs = [imgs[0], copy.deepcopy(imgs[0])] - imgs[1]['idx'] = 1 - filelist = [filelist[0], filelist[0] + '_2'] - - scene_graph_params = [scenegraph_type] - if scenegraph_type in ["swin", "logwin"]: - scene_graph_params.append(str(winsize)) - elif scenegraph_type == "oneref": - scene_graph_params.append(str(refid)) - if scenegraph_type in ["swin", "logwin"] and not win_cyclic: - scene_graph_params.append('noncyclic') - scene_graph = '-'.join(scene_graph_params) - pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True) - if optim_level == 'coarse': - niter2 = 0 - # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation) - if current_scene_state is not None and \ - not current_scene_state.should_delete and \ - current_scene_state.cache_dir is not None: - cache_dir = current_scene_state.cache_dir - elif gradio_delete_cache: - cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir) - else: - cache_dir = os.path.join(outdir, 'cache') - os.makedirs(cache_dir, exist_ok=True) - scene = sparse_global_alignment(filelist, pairs, cache_dir, - model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device, - opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics, - matching_conf_thr=matching_conf_thr, **kw) - if current_scene_state is not None and \ - not current_scene_state.should_delete and \ - current_scene_state.outfile_name is not None: - outfile_name = current_scene_state.outfile_name - else: - outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir) - - scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name) - outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh) - return scene_state, outfile - - -def set_scenegraph_options(inputfiles, win_cyclic, refid, scenegraph_type): - num_files = len(inputfiles) if inputfiles is not None else 1 - show_win_controls = scenegraph_type in ["swin", "logwin"] - show_winsize = scenegraph_type in ["swin", "logwin"] - show_cyclic = scenegraph_type in ["swin", "logwin"] - max_winsize, min_winsize = 1, 1 - if scenegraph_type == "swin": - if win_cyclic: - max_winsize = max(1, math.ceil((num_files - 1) / 2)) - else: - max_winsize = num_files - 1 - elif scenegraph_type == "logwin": - if win_cyclic: - half_size = math.ceil((num_files - 1) / 2) - max_winsize = max(1, math.ceil(math.log(half_size, 2))) - else: - max_winsize = max(1, math.ceil(math.log(num_files, 2))) - winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, - minimum=min_winsize, maximum=max_winsize, step=1, visible=show_winsize) - win_cyclic = gradio.Checkbox(value=win_cyclic, label="Cyclic sequence", visible=show_cyclic) - win_col = gradio.Column(visible=show_win_controls) - refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, - maximum=num_files - 1, step=1, visible=scenegraph_type == 'oneref') - return win_col, winsize, win_cyclic, refid - - -def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False, - share=False, gradio_delete_cache=False): - if not silent: - print('Outputing stuff in', tmpdirname) - - recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, gradio_delete_cache, model, device, - silent, image_size) - model_from_scene_fun = functools.partial(get_3D_model_from_scene, silent) - - def get_context(delete_cache): - css = """.gradio-container {margin: 0 !important; min-width: 100%};""" - title = "MASt3R Demo" - if delete_cache: - return gradio.Blocks(css=css, title=title, delete_cache=(delete_cache, delete_cache)) - else: - return gradio.Blocks(css=css, title="MASt3R Demo") # for compatibility with older versions - - with get_context(gradio_delete_cache) as demo: - # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference - scene = gradio.State(None) - gradio.HTML('

MASt3R Demo

') - with gradio.Column(): - inputfiles = gradio.File(file_count="multiple") - with gradio.Row(): - with gradio.Column(): - with gradio.Row(): - lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01) - niter1 = gradio.Number(value=500, precision=0, minimum=0, maximum=10_000, - label="num_iterations", info="For coarse alignment!") - lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001) - niter2 = gradio.Number(value=200, precision=0, minimum=0, maximum=100_000, - label="num_iterations", info="For refinement!") - optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"], - value='refine+depth', label="OptLevel", - info="Optimization level") - with gradio.Row(): - matching_conf_thr = gradio.Slider(label="Matching Confidence Thr", value=5., - minimum=0., maximum=30., step=0.1, - info="Before Fallback to Regr3D!") - shared_intrinsics = gradio.Checkbox(value=False, label="Shared intrinsics", - info="Only optimize one set of intrinsics for all views") - scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"), - ("swin: sliding window", "swin"), - ("logwin: sliding window with long range", "logwin"), - ("oneref: match one image with all", "oneref")], - value='complete', label="Scenegraph", - info="Define how to make pairs", - interactive=True) - with gradio.Column(visible=False) as win_col: - winsize = gradio.Slider(label="Scene Graph: Window Size", value=1, - minimum=1, maximum=1, step=1) - win_cyclic = gradio.Checkbox(value=False, label="Cyclic sequence") - refid = gradio.Slider(label="Scene Graph: Id", value=0, - minimum=0, maximum=0, step=1, visible=False) - run_btn = gradio.Button("Run") - - with gradio.Row(): - # adjust the confidence threshold - min_conf_thr = gradio.Slider(label="min_conf_thr", value=1.5, minimum=0.0, maximum=10, step=0.1) - # adjust the camera size in the output pointcloud - cam_size = gradio.Slider(label="cam_size", value=0.2, minimum=0.001, maximum=1.0, step=0.001) - TSDF_thresh = gradio.Slider(label="TSDF Threshold", value=0., minimum=0., maximum=1., step=0.01) - with gradio.Row(): - as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud") - # two post process implemented - mask_sky = gradio.Checkbox(value=False, label="Mask sky") - clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps") - transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras") - - outmodel = gradio.Model3D() - - # events - scenegraph_type.change(set_scenegraph_options, - inputs=[inputfiles, win_cyclic, refid, scenegraph_type], - outputs=[win_col, winsize, win_cyclic, refid]) - inputfiles.change(set_scenegraph_options, - inputs=[inputfiles, win_cyclic, refid, scenegraph_type], - outputs=[win_col, winsize, win_cyclic, refid]) - win_cyclic.change(set_scenegraph_options, - inputs=[inputfiles, win_cyclic, refid, scenegraph_type], - outputs=[win_col, winsize, win_cyclic, refid]) - run_btn.click(fn=recon_fun, - inputs=[scene, inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr, - as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, - scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics], - outputs=[scene, outmodel]) - min_conf_thr.release(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - cam_size.change(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - TSDF_thresh.change(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - as_pointcloud.change(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - mask_sky.change(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - clean_depth.change(fn=model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - transparent_cams.change(model_from_scene_fun, - inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, - clean_depth, transparent_cams, cam_size, TSDF_thresh], - outputs=outmodel) - demo.launch(share=share, server_name=server_name, server_port=server_port) diff --git a/modules/mast3r/fast_nn.py b/modules/mast3r/fast_nn.py deleted file mode 100644 index 05537f43c1be10b3733e80def8295c2ff5b5b8c0..0000000000000000000000000000000000000000 --- a/modules/mast3r/fast_nn.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# MASt3R Fast Nearest Neighbor -# -------------------------------------------------------- -import torch -import numpy as np -import math -from scipy.spatial import KDTree - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.utils.device import to_numpy, todevice # noqa - - -@torch.no_grad() -def bruteforce_reciprocal_nns(A, B, device='cuda', block_size=None, dist='l2'): - if isinstance(A, np.ndarray): - A = torch.from_numpy(A).to(device) - if isinstance(B, np.ndarray): - B = torch.from_numpy(B).to(device) - - A = A.to(device) - B = B.to(device) - - if dist == 'l2': - dist_func = torch.cdist - argmin = torch.min - elif dist == 'dot': - def dist_func(A, B): - return A @ B.T - - def argmin(X, dim): - sim, nn = torch.max(X, dim=dim) - return sim.neg_(), nn - else: - raise ValueError(f'Unknown {dist=}') - - if block_size is None or len(A) * len(B) <= block_size**2: - dists = dist_func(A, B) - _, nn_A = argmin(dists, dim=1) - _, nn_B = argmin(dists, dim=0) - else: - dis_A = torch.full((A.shape[0],), float('inf'), device=device, dtype=A.dtype) - dis_B = torch.full((B.shape[0],), float('inf'), device=device, dtype=B.dtype) - nn_A = torch.full((A.shape[0],), -1, device=device, dtype=torch.int64) - nn_B = torch.full((B.shape[0],), -1, device=device, dtype=torch.int64) - number_of_iteration_A = math.ceil(A.shape[0] / block_size) - number_of_iteration_B = math.ceil(B.shape[0] / block_size) - - for i in range(number_of_iteration_A): - A_i = A[i * block_size:(i + 1) * block_size] - for j in range(number_of_iteration_B): - B_j = B[j * block_size:(j + 1) * block_size] - dists_blk = dist_func(A_i, B_j) # A, B, 1 - # dists_blk = dists[i * block_size:(i+1)*block_size, j * block_size:(j+1)*block_size] - min_A_i, argmin_A_i = argmin(dists_blk, dim=1) - min_B_j, argmin_B_j = argmin(dists_blk, dim=0) - - col_mask = min_A_i < dis_A[i * block_size:(i + 1) * block_size] - line_mask = min_B_j < dis_B[j * block_size:(j + 1) * block_size] - - dis_A[i * block_size:(i + 1) * block_size][col_mask] = min_A_i[col_mask] - dis_B[j * block_size:(j + 1) * block_size][line_mask] = min_B_j[line_mask] - - nn_A[i * block_size:(i + 1) * block_size][col_mask] = argmin_A_i[col_mask] + (j * block_size) - nn_B[j * block_size:(j + 1) * block_size][line_mask] = argmin_B_j[line_mask] + (i * block_size) - nn_A = nn_A.cpu().numpy() - nn_B = nn_B.cpu().numpy() - return nn_A, nn_B - - -class cdistMatcher: - def __init__(self, db_pts, device='cuda'): - self.db_pts = db_pts.to(device) - self.device = device - - def query(self, queries, k=1, **kw): - assert k == 1 - if queries.numel() == 0: - return None, [] - nnA, nnB = bruteforce_reciprocal_nns(queries, self.db_pts, device=self.device, **kw) - dis = None - return dis, nnA - - -def merge_corres(idx1, idx2, shape1=None, shape2=None, ret_xy=True, ret_index=False): - assert idx1.dtype == idx2.dtype == np.int32 - - # unique and sort along idx1 - corres = np.unique(np.c_[idx2, idx1].view(np.int64), return_index=ret_index) - if ret_index: - corres, indices = corres - xy2, xy1 = corres[:, None].view(np.int32).T - - if ret_xy: - assert shape1 and shape2 - xy1 = np.unravel_index(xy1, shape1) - xy2 = np.unravel_index(xy2, shape2) - if ret_xy != 'y_x': - xy1 = xy1[0].base[:, ::-1] - xy2 = xy2[0].base[:, ::-1] - - if ret_index: - return xy1, xy2, indices - return xy1, xy2 - - -def fast_reciprocal_NNs(pts1, pts2, subsample_or_initxy1=8, ret_xy=True, pixel_tol=0, ret_basin=False, - device='cuda', **matcher_kw): - H1, W1, DIM1 = pts1.shape - H2, W2, DIM2 = pts2.shape - assert DIM1 == DIM2 - - pts1 = pts1.reshape(-1, DIM1) - pts2 = pts2.reshape(-1, DIM2) - - if isinstance(subsample_or_initxy1, int) and pixel_tol == 0: - S = subsample_or_initxy1 - y1, x1 = np.mgrid[S // 2:H1:S, S // 2:W1:S].reshape(2, -1) - max_iter = 10 - else: - x1, y1 = subsample_or_initxy1 - if isinstance(x1, torch.Tensor): - x1 = x1.cpu().numpy() - if isinstance(y1, torch.Tensor): - y1 = y1.cpu().numpy() - max_iter = 1 - - xy1 = np.int32(np.unique(x1 + W1 * y1)) # make sure there's no doublons - xy2 = np.full_like(xy1, -1) - old_xy1 = xy1.copy() - old_xy2 = xy2.copy() - - if 'dist' in matcher_kw or 'block_size' in matcher_kw \ - or (isinstance(device, str) and device.startswith('cuda')) \ - or (isinstance(device, torch.device) and device.type.startswith('cuda')): - pts1 = pts1.to(device) - pts2 = pts2.to(device) - tree1 = cdistMatcher(pts1, device=device) - tree2 = cdistMatcher(pts2, device=device) - else: - pts1, pts2 = to_numpy((pts1, pts2)) - tree1 = KDTree(pts1) - tree2 = KDTree(pts2) - - notyet = np.ones(len(xy1), dtype=bool) - basin = np.full((H1 * W1 + 1,), -1, dtype=np.int32) if ret_basin else None - - niter = 0 - # n_notyet = [len(notyet)] - while notyet.any(): - _, xy2[notyet] = to_numpy(tree2.query(pts1[xy1[notyet]], **matcher_kw)) - if not ret_basin: - notyet &= (old_xy2 != xy2) # remove points that have converged - - _, xy1[notyet] = to_numpy(tree1.query(pts2[xy2[notyet]], **matcher_kw)) - if ret_basin: - basin[old_xy1[notyet]] = xy1[notyet] - notyet &= (old_xy1 != xy1) # remove points that have converged - - # n_notyet.append(notyet.sum()) - niter += 1 - if niter >= max_iter: - break - - old_xy2[:] = xy2 - old_xy1[:] = xy1 - - # print('notyet_stats:', ' '.join(map(str, (n_notyet+[0]*10)[:max_iter]))) - - if pixel_tol > 0: - # in case we only want to match some specific points - # and still have some way of checking reciprocity - old_yx1 = np.unravel_index(old_xy1, (H1, W1))[0].base - new_yx1 = np.unravel_index(xy1, (H1, W1))[0].base - dis = np.linalg.norm(old_yx1 - new_yx1, axis=-1) - converged = dis < pixel_tol - if not isinstance(subsample_or_initxy1, int): - xy1 = old_xy1 # replace new points by old ones - else: - converged = ~notyet # converged correspondences - - # keep only unique correspondences, and sort on xy1 - xy1, xy2 = merge_corres(xy1[converged], xy2[converged], (H1, W1), (H2, W2), ret_xy=ret_xy) - if ret_basin: - return xy1, xy2, basin - return xy1, xy2 - - -def extract_correspondences_nonsym(A, B, confA, confB, subsample=8, device=None, ptmap_key='pred_desc', pixel_tol=0): - if '3d' in ptmap_key: - opt = dict(device='cpu', workers=32) - else: - opt = dict(device=device, dist='dot', block_size=2**13) - - # matching the two pairs - idx1 = [] - idx2 = [] - # merge corres from opposite pairs - HA, WA = A.shape[:2] - HB, WB = B.shape[:2] - if pixel_tol == 0: - nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt) - nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt) - else: - S = subsample - yA, xA = np.mgrid[S // 2:HA:S, S // 2:WA:S].reshape(2, -1) - yB, xB = np.mgrid[S // 2:HB:S, S // 2:WB:S].reshape(2, -1) - - nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=(xA, yA), ret_xy=False, pixel_tol=pixel_tol, **opt) - nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=(xB, yB), ret_xy=False, pixel_tol=pixel_tol, **opt) - - idx1 = np.r_[nn1to2[0], nn2to1[1]] - idx2 = np.r_[nn1to2[1], nn2to1[0]] - - c1 = confA.ravel()[idx1] - c2 = confB.ravel()[idx2] - - xy1, xy2, idx = merge_corres(idx1, idx2, (HA, WA), (HB, WB), ret_xy=True, ret_index=True) - conf = np.minimum(c1[idx], c2[idx]) - corres = (xy1.copy(), xy2.copy(), conf) - return todevice(corres, device) diff --git a/modules/mast3r/losses.py b/modules/mast3r/losses.py deleted file mode 100644 index 3a50f57481e436d7752dcbf2b414be3ea65ee76b..0000000000000000000000000000000000000000 --- a/modules/mast3r/losses.py +++ /dev/null @@ -1,508 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Implementation of MASt3R training losses -# -------------------------------------------------------- -import torch -import torch.nn as nn -import numpy as np -from sklearn.metrics import average_precision_score - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.losses import BaseCriterion, Criterion, MultiLoss, Sum, ConfLoss -from dust3r.losses import Regr3D as Regr3D_dust3r -from dust3r.utils.geometry import (geotrf, inv, normalize_pointcloud) -from dust3r.inference import get_pred_pts3d -from dust3r.utils.geometry import get_joint_pointcloud_depth, get_joint_pointcloud_center_scale - - -def apply_log_to_norm(xyz): - d = xyz.norm(dim=-1, keepdim=True) - xyz = xyz / d.clip(min=1e-8) - xyz = xyz * torch.log1p(d) - return xyz - - -class Regr3D (Regr3D_dust3r): - def __init__(self, criterion, norm_mode='avg_dis', gt_scale=False, opt_fit_gt=False, - sky_loss_value=2, max_metric_scale=False, loss_in_log=False): - self.loss_in_log = loss_in_log - if norm_mode.startswith('?'): - # do no norm pts from metric scale datasets - self.norm_all = False - self.norm_mode = norm_mode[1:] - else: - self.norm_all = True - self.norm_mode = norm_mode - super().__init__(criterion, self.norm_mode, gt_scale) - - self.sky_loss_value = sky_loss_value - self.max_metric_scale = max_metric_scale - - def get_all_pts3d(self, gt1, gt2, pred1, pred2, dist_clip=None): - # everything is normalized w.r.t. camera of view1 - in_camera1 = inv(gt1['camera_pose']) - gt_pts1 = geotrf(in_camera1, gt1['pts3d']) # B,H,W,3 - gt_pts2 = geotrf(in_camera1, gt2['pts3d']) # B,H,W,3 - - valid1 = gt1['valid_mask'].clone() - valid2 = gt2['valid_mask'].clone() - - if dist_clip is not None: - # points that are too far-away == invalid - dis1 = gt_pts1.norm(dim=-1) # (B, H, W) - dis2 = gt_pts2.norm(dim=-1) # (B, H, W) - valid1 = valid1 & (dis1 <= dist_clip) - valid2 = valid2 & (dis2 <= dist_clip) - - if self.loss_in_log == 'before': - # this only make sense when depth_mode == 'linear' - gt_pts1 = apply_log_to_norm(gt_pts1) - gt_pts2 = apply_log_to_norm(gt_pts2) - - pr_pts1 = get_pred_pts3d(gt1, pred1, use_pose=False).clone() - pr_pts2 = get_pred_pts3d(gt2, pred2, use_pose=True).clone() - - if not self.norm_all: - if self.max_metric_scale: - B = valid1.shape[0] - # valid1: B, H, W - # torch.linalg.norm(gt_pts1, dim=-1) -> B, H, W - # dist1_to_cam1 -> reshape to B, H*W - dist1_to_cam1 = torch.where(valid1, torch.linalg.norm(gt_pts1, dim=-1), 0).view(B, -1) - dist2_to_cam1 = torch.where(valid2, torch.linalg.norm(gt_pts2, dim=-1), 0).view(B, -1) - - # is_metric_scale: B - # dist1_to_cam1.max(dim=-1).values -> B - gt1['is_metric_scale'] = gt1['is_metric_scale'] \ - & (dist1_to_cam1.max(dim=-1).values < self.max_metric_scale) \ - & (dist2_to_cam1.max(dim=-1).values < self.max_metric_scale) - gt2['is_metric_scale'] = gt1['is_metric_scale'] - - mask = ~gt1['is_metric_scale'] - else: - mask = torch.ones_like(gt1['is_metric_scale']) - # normalize 3d points - if self.norm_mode and mask.any(): - pr_pts1[mask], pr_pts2[mask] = normalize_pointcloud(pr_pts1[mask], pr_pts2[mask], self.norm_mode, - valid1[mask], valid2[mask]) - - if self.norm_mode and not self.gt_scale: - gt_pts1, gt_pts2, norm_factor = normalize_pointcloud(gt_pts1, gt_pts2, self.norm_mode, - valid1, valid2, ret_factor=True) - # apply the same normalization to prediction - pr_pts1[~mask] = pr_pts1[~mask] / norm_factor[~mask] - pr_pts2[~mask] = pr_pts2[~mask] / norm_factor[~mask] - - # return sky segmentation, making sure they don't include any labelled 3d points - sky1 = gt1['sky_mask'] & (~valid1) - sky2 = gt2['sky_mask'] & (~valid2) - return gt_pts1, gt_pts2, pr_pts1, pr_pts2, valid1, valid2, sky1, sky2, {} - - def compute_loss(self, gt1, gt2, pred1, pred2, **kw): - gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \ - self.get_all_pts3d(gt1, gt2, pred1, pred2, **kw) - - if self.sky_loss_value > 0: - assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss' - # add the sky pixel as "valid" pixels... - mask1 = mask1 | sky1 - mask2 = mask2 | sky2 - - # loss on img1 side - pred_pts1 = pred_pts1[mask1] - gt_pts1 = gt_pts1[mask1] - if self.loss_in_log and self.loss_in_log != 'before': - # this only make sense when depth_mode == 'exp' - pred_pts1 = apply_log_to_norm(pred_pts1) - gt_pts1 = apply_log_to_norm(gt_pts1) - l1 = self.criterion(pred_pts1, gt_pts1) - - # loss on gt2 side - pred_pts2 = pred_pts2[mask2] - gt_pts2 = gt_pts2[mask2] - if self.loss_in_log and self.loss_in_log != 'before': - pred_pts2 = apply_log_to_norm(pred_pts2) - gt_pts2 = apply_log_to_norm(gt_pts2) - l2 = self.criterion(pred_pts2, gt_pts2) - - if self.sky_loss_value > 0: - assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss' - # ... but force the loss to be high there - l1 = torch.where(sky1[mask1], self.sky_loss_value, l1) - l2 = torch.where(sky2[mask2], self.sky_loss_value, l2) - self_name = type(self).__name__ - details = {self_name + '_pts3d_1': float(l1.mean()), self_name + '_pts3d_2': float(l2.mean())} - return Sum((l1, mask1), (l2, mask2)), (details | monitoring) - - -class Regr3D_ShiftInv (Regr3D): - """ Same than Regr3D but invariant to depth shift. - """ - - def get_all_pts3d(self, gt1, gt2, pred1, pred2): - # compute unnormalized points - gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \ - super().get_all_pts3d(gt1, gt2, pred1, pred2) - - # compute median depth - gt_z1, gt_z2 = gt_pts1[..., 2], gt_pts2[..., 2] - pred_z1, pred_z2 = pred_pts1[..., 2], pred_pts2[..., 2] - gt_shift_z = get_joint_pointcloud_depth(gt_z1, gt_z2, mask1, mask2)[:, None, None] - pred_shift_z = get_joint_pointcloud_depth(pred_z1, pred_z2, mask1, mask2)[:, None, None] - - # subtract the median depth - gt_z1 -= gt_shift_z - gt_z2 -= gt_shift_z - pred_z1 -= pred_shift_z - pred_z2 -= pred_shift_z - - # monitoring = dict(monitoring, gt_shift_z=gt_shift_z.mean().detach(), pred_shift_z=pred_shift_z.mean().detach()) - return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring - - -class Regr3D_ScaleInv (Regr3D): - """ Same than Regr3D but invariant to depth scale. - if gt_scale == True: enforce the prediction to take the same scale than GT - """ - - def get_all_pts3d(self, gt1, gt2, pred1, pred2): - # compute depth-normalized points - gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \ - super().get_all_pts3d(gt1, gt2, pred1, pred2) - - # measure scene scale - _, gt_scale = get_joint_pointcloud_center_scale(gt_pts1, gt_pts2, mask1, mask2) - _, pred_scale = get_joint_pointcloud_center_scale(pred_pts1, pred_pts2, mask1, mask2) - - # prevent predictions to be in a ridiculous range - pred_scale = pred_scale.clip(min=1e-3, max=1e3) - - # subtract the median depth - if self.gt_scale: - pred_pts1 *= gt_scale / pred_scale - pred_pts2 *= gt_scale / pred_scale - # monitoring = dict(monitoring, pred_scale=(pred_scale/gt_scale).mean()) - else: - gt_pts1 /= gt_scale - gt_pts2 /= gt_scale - pred_pts1 /= pred_scale - pred_pts2 /= pred_scale - # monitoring = dict(monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach()) - - return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring - - -class Regr3D_ScaleShiftInv (Regr3D_ScaleInv, Regr3D_ShiftInv): - # calls Regr3D_ShiftInv first, then Regr3D_ScaleInv - pass - - -def get_similarities(desc1, desc2, euc=False): - if euc: # euclidean distance in same range than similarities - dists = (desc1[:, :, None] - desc2[:, None]).norm(dim=-1) - sim = 1 / (1 + dists) - else: - # Compute similarities - sim = desc1 @ desc2.transpose(-2, -1) - return sim - - -class MatchingCriterion(BaseCriterion): - def __init__(self, reduction='mean', fp=torch.float32): - super().__init__(reduction) - self.fp = fp - - def forward(self, a, b, valid_matches=None, euc=False): - assert a.ndim >= 2 and 1 <= a.shape[-1], f'Bad shape = {a.shape}' - dist = self.loss(a.to(self.fp), b.to(self.fp), valid_matches, euc=euc) - # one dimension less or reduction to single value - assert (valid_matches is None and dist.ndim == a.ndim - - 1) or self.reduction in ['mean', 'sum', '1-mean', 'none'] - if self.reduction == 'none': - return dist - if self.reduction == 'sum': - return dist.sum() - if self.reduction == 'mean': - return dist.mean() if dist.numel() > 0 else dist.new_zeros(()) - if self.reduction == '1-mean': - return 1. - dist.mean() if dist.numel() > 0 else dist.new_ones(()) - raise ValueError(f'bad {self.reduction=} mode') - - def loss(self, a, b, valid_matches=None): - raise NotImplementedError - - -class InfoNCE(MatchingCriterion): - def __init__(self, temperature=0.07, eps=1e-8, mode='all', **kwargs): - super().__init__(**kwargs) - self.temperature = temperature - self.eps = eps - assert mode in ['all', 'proper', 'dual'] - self.mode = mode - - def loss(self, desc1, desc2, valid_matches=None, euc=False): - # valid positives are along diagonals - B, N, D = desc1.shape - B2, N2, D2 = desc2.shape - assert B == B2 and D == D2 - if valid_matches is None: - valid_matches = torch.ones([B, N], dtype=bool) - # torch.all(valid_matches.sum(dim=-1) > 0) some pairs have no matches???? - assert valid_matches.shape == torch.Size([B, N]) and valid_matches.sum() > 0 - - # Tempered similarities - sim = get_similarities(desc1, desc2, euc) / self.temperature - sim[sim.isnan()] = -torch.inf # ignore nans - # Softmax of positives with temperature - sim = sim.exp_() # save peak memory - positives = sim.diagonal(dim1=-2, dim2=-1) - - # Loss - if self.mode == 'all': # Previous InfoNCE - loss = -torch.log((positives / sim.sum(dim=-1).sum(dim=-1, keepdim=True)).clip(self.eps)) - elif self.mode == 'proper': # Proper InfoNCE - loss = -(torch.log((positives / sim.sum(dim=-2)).clip(self.eps)) + - torch.log((positives / sim.sum(dim=-1)).clip(self.eps))) - elif self.mode == 'dual': # Dual Softmax - loss = -(torch.log((positives**2 / sim.sum(dim=-1) / sim.sum(dim=-2)).clip(self.eps))) - else: - raise ValueError("This should not happen...") - return loss[valid_matches] - - -class APLoss (MatchingCriterion): - """ AP loss. - """ - - def __init__(self, nq='torch', min=0, max=1, euc=False, **kw): - super().__init__(**kw) - # Exact/True AP loss (not differentiable) - if nq == 0: - nq = 'sklearn' # special case - try: - self.compute_AP = eval('self.compute_true_AP_' + nq) - except: - raise ValueError("Unknown mode %s for AP loss" % nq) - - @staticmethod - def compute_true_AP_sklearn(scores, labels): - def compute_AP(label, score): - return average_precision_score(label, score) - - aps = scores.new_zeros((scores.shape[0], scores.shape[1])) - label_np = labels.cpu().numpy().astype(bool) - scores_np = scores.cpu().numpy() - for bi in range(scores_np.shape[0]): - for i in range(scores_np.shape[1]): - labels = label_np[bi, i, :] - if labels.sum() < 1: - continue - aps[bi, i] = compute_AP(labels, scores_np[bi, i, :]) - return aps - - @staticmethod - def compute_true_AP_torch(scores, labels): - assert scores.shape == labels.shape - B, N, M = labels.shape - dev = labels.device - with torch.no_grad(): - # sort scores - _, order = scores.sort(dim=-1, descending=True) - # sort labels accordingly - labels = labels[torch.arange(B, device=dev)[:, None, None].expand(order.shape), - torch.arange(N, device=dev)[None, :, None].expand(order.shape), - order] - # compute number of positives per query - npos = labels.sum(dim=-1) - assert torch.all(torch.isclose(npos, npos[0, 0]) - ), "only implemented for constant number of positives per query" - npos = int(npos[0, 0]) - # compute precision at each recall point - posrank = labels.nonzero()[:, -1].view(B, N, npos) - recall = torch.arange(1, 1 + npos, dtype=torch.float32, device=dev)[None, None, :].expand(B, N, npos) - precision = recall / (1 + posrank).float() - # average precision values at all recall points - aps = precision.mean(dim=-1) - - return aps - - def loss(self, desc1, desc2, valid_matches=None, euc=False): # if matches is None, positives are the diagonal - B, N1, D = desc1.shape - B2, N2, D2 = desc2.shape - assert B == B2 and D == D2 - - scores = get_similarities(desc1, desc2, euc) - - labels = torch.zeros([B, N1, N2], dtype=scores.dtype, device=scores.device) - - # allow all diagonal positives and only mask afterwards - labels.diagonal(dim1=-2, dim2=-1)[...] = 1. - apscore = self.compute_AP(scores, labels) - if valid_matches is not None: - apscore = apscore[valid_matches] - return apscore - - -class MatchingLoss (Criterion, MultiLoss): - """ - Matching loss per image - only compare pixels inside an image but not in the whole batch as what would be done usually - """ - - def __init__(self, criterion, withconf=False, use_pts3d=False, negatives_padding=0, blocksize=4096): - super().__init__(criterion) - self.negatives_padding = negatives_padding - self.use_pts3d = use_pts3d - self.blocksize = blocksize - self.withconf = withconf - - def add_negatives(self, outdesc2, desc2, batchid, x2, y2): - if self.negatives_padding: - B, H, W, D = desc2.shape - negatives = torch.ones([B, H, W], device=desc2.device, dtype=bool) - negatives[batchid, y2, x2] = False - sel = negatives & (negatives.view([B, -1]).cumsum(dim=-1).view(B, H, W) - <= self.negatives_padding) # take the N-first negatives - outdesc2 = torch.cat([outdesc2, desc2[sel].view([B, -1, D])], dim=1) - return outdesc2 - - def get_confs(self, pred1, pred2, sel1, sel2): - if self.withconf: - if self.use_pts3d: - outconfs1 = pred1['conf'][sel1] - outconfs2 = pred2['conf'][sel2] - else: - outconfs1 = pred1['desc_conf'][sel1] - outconfs2 = pred2['desc_conf'][sel2] - else: - outconfs1 = outconfs2 = None - return outconfs1, outconfs2 - - def get_descs(self, pred1, pred2): - if self.use_pts3d: - desc1, desc2 = pred1['pts3d'], pred2['pts3d_in_other_view'] - else: - desc1, desc2 = pred1['desc'], pred2['desc'] - return desc1, desc2 - - def get_matching_descs(self, gt1, gt2, pred1, pred2, **kw): - outdesc1 = outdesc2 = outconfs1 = outconfs2 = None - # Recover descs, GT corres and valid mask - desc1, desc2 = self.get_descs(pred1, pred2) - - (x1, y1), (x2, y2) = gt1['corres'].unbind(-1), gt2['corres'].unbind(-1) - valid_matches = gt1['valid_corres'] - - # Select descs that have GT matches - B, N = x1.shape - batchid = torch.arange(B)[:, None].repeat(1, N) # B, N - outdesc1, outdesc2 = desc1[batchid, y1, x1], desc2[batchid, y2, x2] # B, N, D - - # Padd with unused negatives - outdesc2 = self.add_negatives(outdesc2, desc2, batchid, x2, y2) - - # Gather confs if needed - sel1 = batchid, y1, x1 - sel2 = batchid, y2, x2 - outconfs1, outconfs2 = self.get_confs(pred1, pred2, sel1, sel2) - - return outdesc1, outdesc2, outconfs1, outconfs2, valid_matches, {'use_euclidean_dist': self.use_pts3d} - - def blockwise_criterion(self, descs1, descs2, confs1, confs2, valid_matches, euc, rng=np.random, shuffle=True): - loss = None - details = {} - B, N, D = descs1.shape - - if N <= self.blocksize: # Blocks are larger than provided descs, compute regular loss - loss = self.criterion(descs1, descs2, valid_matches, euc=euc) - else: # Compute criterion on the blockdiagonal only, after shuffling - # Shuffle if necessary - matches_perm = slice(None) - if shuffle: - matches_perm = np.stack([rng.choice(range(N), size=N, replace=False) for _ in range(B)]) - batchid = torch.tile(torch.arange(B), (N, 1)).T - matches_perm = batchid, matches_perm - - descs1 = descs1[matches_perm] - descs2 = descs2[matches_perm] - valid_matches = valid_matches[matches_perm] - - assert N % self.blocksize == 0, "Error, can't chunk block-diagonal, please check blocksize" - n_chunks = N // self.blocksize - descs1 = descs1.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D] - descs2 = descs2.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D] - valid_matches = valid_matches.view([B * n_chunks, self.blocksize]) - loss = self.criterion(descs1, descs2, valid_matches, euc=euc) - if self.withconf: - confs1, confs2 = map(lambda x: x[matches_perm], (confs1, confs2)) # apply perm to confidences if needed - - if self.withconf: - # split confidences between positives/negatives for loss computation - details['conf_pos'] = map(lambda x: x[valid_matches.view(B, -1)], (confs1, confs2)) - details['conf_neg'] = map(lambda x: x[~valid_matches.view(B, -1)], (confs1, confs2)) - details['Conf1_std'] = confs1.std() - details['Conf2_std'] = confs2.std() - - return loss, details - - def compute_loss(self, gt1, gt2, pred1, pred2, **kw): - # Gather preds and GT - descs1, descs2, confs1, confs2, valid_matches, monitoring = self.get_matching_descs( - gt1, gt2, pred1, pred2, **kw) - - # loss on matches - loss, details = self.blockwise_criterion(descs1, descs2, confs1, confs2, - valid_matches, euc=monitoring.pop('use_euclidean_dist', False)) - - details[type(self).__name__] = float(loss.mean()) - return loss, (details | monitoring) - - -class ConfMatchingLoss(ConfLoss): - """ Weight matching by learned confidence. Same as ConfLoss but for a matching criterion - Assuming the input matching_loss is a match-level loss. - """ - - def __init__(self, pixel_loss, alpha=1., confmode='prod', neg_conf_loss_quantile=False): - super().__init__(pixel_loss, alpha) - self.pixel_loss.withconf = True - self.confmode = confmode - self.neg_conf_loss_quantile = neg_conf_loss_quantile - - def aggregate_confs(self, confs1, confs2): # get the confidences resulting from the two view predictions - if self.confmode == 'prod': - confs = confs1 * confs2 if confs1 is not None and confs2 is not None else 1. - elif self.confmode == 'mean': - confs = .5 * (confs1 + confs2) if confs1 is not None and confs2 is not None else 1. - else: - raise ValueError(f"Unknown conf mode {self.confmode}") - return confs - - def compute_loss(self, gt1, gt2, pred1, pred2, **kw): - # compute per-pixel loss - loss, details = self.pixel_loss(gt1, gt2, pred1, pred2, **kw) - # Recover confidences for positive and negative samples - conf1_pos, conf2_pos = details.pop('conf_pos') - conf1_neg, conf2_neg = details.pop('conf_neg') - conf_pos = self.aggregate_confs(conf1_pos, conf2_pos) - - # weight Matching loss by confidence on positives - conf_pos, log_conf_pos = self.get_conf_log(conf_pos) - conf_loss = loss * conf_pos - self.alpha * log_conf_pos - # average + nan protection (in case of no valid pixels at all) - conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else 0 - # Add negative confs loss to give some supervision signal to confidences for pixels that are not matched in GT - if self.neg_conf_loss_quantile: - conf_neg = torch.cat([conf1_neg, conf2_neg]) - conf_neg, log_conf_neg = self.get_conf_log(conf_neg) - - # recover quantile that will be used for negatives loss value assignment - neg_loss_value = torch.quantile(loss, self.neg_conf_loss_quantile).detach() - neg_loss = neg_loss_value * conf_neg - self.alpha * log_conf_neg - - neg_loss = neg_loss.mean() if neg_loss.numel() > 0 else 0 - conf_loss = conf_loss + neg_loss - - return conf_loss, dict(matching_conf_loss=float(conf_loss), **details) diff --git a/modules/mast3r/model.py b/modules/mast3r/model.py deleted file mode 100644 index f328c5e43b8e98f2ec960e4d25e6f235ac543544..0000000000000000000000000000000000000000 --- a/modules/mast3r/model.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# MASt3R model class -# -------------------------------------------------------- -import torch -import torch.nn.functional as F -import os - -from mast3r.catmlp_dpt_head import mast3r_head_factory - -import mast3r.utils.path_to_dust3r # noqa -from dust3r.model import AsymmetricCroCo3DStereo # noqa -from dust3r.utils.misc import transpose_to_landscape # noqa - - -inf = float('inf') - - -def load_model(model_path, device, verbose=True): - if verbose: - print('... loading model from', model_path) - ckpt = torch.load(model_path, map_location='cpu') - args = ckpt['args'].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R") - if 'landscape_only' not in args: - args = args[:-1] + ', landscape_only=False)' - else: - args = args.replace(" ", "").replace('landscape_only=True', 'landscape_only=False') - assert "landscape_only=False" in args - if verbose: - print(f"instantiating : {args}") - net = eval(args) - s = net.load_state_dict(ckpt['model'], strict=False) - if verbose: - print(s) - return net.to(device) - - -class AsymmetricMASt3R(AsymmetricCroCo3DStereo): - def __init__(self, desc_mode=('norm'), two_confs=False, desc_conf_mode=None, **kwargs): - self.desc_mode = desc_mode - self.two_confs = two_confs - self.desc_conf_mode = desc_conf_mode - super().__init__(**kwargs) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kw): - if os.path.isfile(pretrained_model_name_or_path): - return load_model(pretrained_model_name_or_path, device='cpu') - else: - return super(AsymmetricMASt3R, cls).from_pretrained(pretrained_model_name_or_path, **kw) - - def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, **kw): - assert img_size[0] % patch_size == 0 and img_size[ - 1] % patch_size == 0, f'{img_size=} must be multiple of {patch_size=}' - self.output_mode = output_mode - self.head_type = head_type - self.depth_mode = depth_mode - self.conf_mode = conf_mode - if self.desc_conf_mode is None: - self.desc_conf_mode = conf_mode - # allocate heads - self.downstream_head1 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) - self.downstream_head2 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) - # magic wrapper - self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only) - self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only) diff --git a/modules/mast3r/utils/__init__.py b/modules/mast3r/utils/__init__.py deleted file mode 100644 index d7dd877d649ce4dbd749dd7195a8b34c0f91d4f0..0000000000000000000000000000000000000000 --- a/modules/mast3r/utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). \ No newline at end of file diff --git a/modules/mast3r/utils/__pycache__/__init__.cpython-312.pyc b/modules/mast3r/utils/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 59e6815dc300e569a52ac0fb7fe95107f0e9cadd..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/utils/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/utils/__pycache__/misc.cpython-312.pyc b/modules/mast3r/utils/__pycache__/misc.cpython-312.pyc deleted file mode 100644 index 407c57c357a9d0aae0c433c6a9f66fea9c3e4e1b..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/utils/__pycache__/misc.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/utils/__pycache__/path_to_dust3r.cpython-312.pyc b/modules/mast3r/utils/__pycache__/path_to_dust3r.cpython-312.pyc deleted file mode 100644 index c63b4a3af5c50cea24f194e4e5307acc976289f8..0000000000000000000000000000000000000000 Binary files a/modules/mast3r/utils/__pycache__/path_to_dust3r.cpython-312.pyc and /dev/null differ diff --git a/modules/mast3r/utils/coarse_to_fine.py b/modules/mast3r/utils/coarse_to_fine.py deleted file mode 100644 index c062e8608f82c608f2d605d69a95a7e0f301b3cf..0000000000000000000000000000000000000000 --- a/modules/mast3r/utils/coarse_to_fine.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# coarse to fine utilities -# -------------------------------------------------------- -import numpy as np - - -def crop_tag(cell): - return f'[{cell[1]}:{cell[3]},{cell[0]}:{cell[2]}]' - - -def crop_slice(cell): - return slice(cell[1], cell[3]), slice(cell[0], cell[2]) - - -def _start_pos(total_size, win_size, overlap): - # we must have AT LEAST overlap between segments - # first segment starts at 0, last segment starts at total_size-win_size - assert 0 <= overlap < 1 - assert total_size >= win_size - spacing = win_size * (1 - overlap) - last_pt = total_size - win_size - n_windows = 2 + int((last_pt - 1) // spacing) - return np.linspace(0, last_pt, n_windows).round().astype(int) - - -def multiple_of_16(x): - return (x // 16) * 16 - - -def _make_overlapping_grid(H, W, size, overlap): - H_win = multiple_of_16(H * size // max(H, W)) - W_win = multiple_of_16(W * size // max(H, W)) - x = _start_pos(W, W_win, overlap) - y = _start_pos(H, H_win, overlap) - grid = np.stack(np.meshgrid(x, y, indexing='xy'), axis=-1) - grid = np.concatenate((grid, grid + (W_win, H_win)), axis=-1) - return grid.reshape(-1, 4) - - -def _cell_size(cell2): - width, height = cell2[:, 2] - cell2[:, 0], cell2[:, 3] - cell2[:, 1] - assert width.min() >= 0 - assert height.min() >= 0 - return width, height - - -def _norm_windows(cell2, H2, W2, forced_resolution=None): - # make sure the window aspect ratio is 3/4, or the output resolution is forced_resolution if defined - outcell = cell2.copy() - width, height = _cell_size(cell2) - width2, height2 = width.clip(max=W2), height.clip(max=H2) - if forced_resolution is None: - width2[width < height] = (height2[width < height] * 3.01 / 4).clip(max=W2) - height2[width >= height] = (width2[width >= height] * 3.01 / 4).clip(max=H2) - else: - forced_H, forced_W = forced_resolution - width2[:] = forced_W - height2[:] = forced_H - - half = (width2 - width) / 2 - outcell[:, 0] -= half - outcell[:, 2] += half - half = (height2 - height) / 2 - outcell[:, 1] -= half - outcell[:, 3] += half - - # proj to integers - outcell = np.floor(outcell).astype(int) - # Take care of flooring errors - tmpw, tmph = _cell_size(outcell) - outcell[:, 0] += tmpw.astype(tmpw.dtype) - width2.astype(tmpw.dtype) - outcell[:, 1] += tmph.astype(tmpw.dtype) - height2.astype(tmpw.dtype) - - # make sure 0 <= x < W2 and 0 <= y < H2 - outcell[:, 0::2] -= outcell[:, [0]].clip(max=0) - outcell[:, 1::2] -= outcell[:, [1]].clip(max=0) - outcell[:, 0::2] -= outcell[:, [2]].clip(min=W2) - W2 - outcell[:, 1::2] -= outcell[:, [3]].clip(min=H2) - H2 - - width, height = _cell_size(outcell) - assert np.all(width == width2.astype(width.dtype)) and np.all( - height == height2.astype(height.dtype)), "Error, output is not of the expected shape." - assert np.all(width <= W2) - assert np.all(height <= H2) - return outcell - - -def _weight_pixels(cell, pix, assigned, gauss_var=2): - center = cell.reshape(-1, 2, 2).mean(axis=1) - width, height = _cell_size(cell) - - # square distance between each cell center and each point - dist = (center[:, None] - pix[None]) / np.c_[width, height][:, None] - dist2 = np.square(dist).sum(axis=-1) - - assert assigned.shape == dist2.shape - res = np.where(assigned, np.exp(-gauss_var * dist2), 0) - return res - - -def pos2d_in_rect(p1, cell1): - x, y = p1.T - l, t, r, b = cell1 - assigned = (l <= x) & (x < r) & (t <= y) & (y < b) - return assigned - - -def _score_cell(cell1, H2, W2, p1, p2, min_corres=10, forced_resolution=None): - assert p1.shape == p2.shape - - # compute keypoint assignment - assigned = pos2d_in_rect(p1, cell1[None].T) - assert assigned.shape == (len(cell1), len(p1)) - - # remove cells without correspondences - valid_cells = assigned.sum(axis=1) >= min_corres - cell1 = cell1[valid_cells] - assigned = assigned[valid_cells] - if not valid_cells.any(): - return cell1, cell1, assigned - - # fill-in the assigned points in both image - assigned_p1 = np.empty((len(cell1), len(p1), 2), dtype=np.float32) - assigned_p2 = np.empty((len(cell1), len(p2), 2), dtype=np.float32) - assigned_p1[:] = p1[None] - assigned_p2[:] = p2[None] - assigned_p1[~assigned] = np.nan - assigned_p2[~assigned] = np.nan - - # find the median center and scale of assigned points in each cell - # cell_center1 = np.nanmean(assigned_p1, axis=1) - cell_center2 = np.nanmean(assigned_p2, axis=1) - im1_q25, im1_q75 = np.nanquantile(assigned_p1, (0.1, 0.9), axis=1) - im2_q25, im2_q75 = np.nanquantile(assigned_p2, (0.1, 0.9), axis=1) - - robust_std1 = (im1_q75 - im1_q25).clip(20.) - robust_std2 = (im2_q75 - im2_q25).clip(20.) - - cell_size1 = (cell1[:, 2:4] - cell1[:, 0:2]) - cell_size2 = cell_size1 * robust_std2 / robust_std1 - cell2 = np.c_[cell_center2 - cell_size2 / 2, cell_center2 + cell_size2 / 2] - - # make sure cell bounds are valid - cell2 = _norm_windows(cell2, H2, W2, forced_resolution=forced_resolution) - - # compute correspondence weights - corres_weights = _weight_pixels(cell1, p1, assigned) * _weight_pixels(cell2, p2, assigned) - - # return a list of window pairs and assigned correspondences - return cell1, cell2, corres_weights - - -def greedy_selection(corres_weights, target=0.9): - # corres_weight = (n_cell_pair, n_corres) matrix. - # If corres_weight[c,p]>0, means that correspondence p is visible in cell pair p - assert 0 < target <= 1 - corres_weights = corres_weights.copy() - - total = corres_weights.max(axis=0).sum() - target *= total - - # init = empty - res = [] - cur = np.zeros(corres_weights.shape[1]) # current selection - - while cur.sum() < target: - # pick the nex best cell pair - best = corres_weights.sum(axis=1).argmax() - res.append(best) - - # update current - cur += corres_weights[best] - # print('appending', best, 'with score', corres_weights[best].sum(), '-->', cur.sum()) - - # remove from all other views - corres_weights = (corres_weights - corres_weights[best]).clip(min=0) - - return res - - -def select_pairs_of_crops(img_q, img_b, pos2d_in_query, pos2d_in_ref, maxdim=512, overlap=.5, forced_resolution=None): - # prepare the overlapping cells - grid_q = _make_overlapping_grid(*img_q.shape[:2], maxdim, overlap) - grid_b = _make_overlapping_grid(*img_b.shape[:2], maxdim, overlap) - - assert forced_resolution is None or len(forced_resolution) == 2 - if isinstance(forced_resolution[0], int) or not len(forced_resolution[0]) == 2: - forced_resolution1 = forced_resolution2 = forced_resolution - else: - assert len(forced_resolution[1]) == 2 - forced_resolution1 = forced_resolution[0] - forced_resolution2 = forced_resolution[1] - - # Make sure crops respect constraints - grid_q = _norm_windows(grid_q.astype(float), *img_q.shape[:2], forced_resolution=forced_resolution1) - grid_b = _norm_windows(grid_b.astype(float), *img_b.shape[:2], forced_resolution=forced_resolution2) - - # score cells - pairs_q = _score_cell(grid_q, *img_b.shape[:2], pos2d_in_query, pos2d_in_ref, forced_resolution=forced_resolution2) - pairs_b = _score_cell(grid_b, *img_q.shape[:2], pos2d_in_ref, pos2d_in_query, forced_resolution=forced_resolution1) - pairs_b = pairs_b[1], pairs_b[0], pairs_b[2] # cellq, cellb, corres_weights - - # greedy selection until all correspondences are generated - cell1, cell2, corres_weights = map(np.concatenate, zip(pairs_q, pairs_b)) - if len(corres_weights) == 0: - return # tolerated for empty generators - order = greedy_selection(corres_weights, target=0.9) - - for i in order: - def pair_tag(qi, bi): return (str(qi) + crop_tag(cell1[i]), str(bi) + crop_tag(cell2[i])) - yield cell1[i], cell2[i], pair_tag diff --git a/modules/mast3r/utils/collate.py b/modules/mast3r/utils/collate.py deleted file mode 100644 index 72ee3a437b87ef7049dcd03b93e594a8325b780c..0000000000000000000000000000000000000000 --- a/modules/mast3r/utils/collate.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# Collate extensions -# -------------------------------------------------------- - -import torch -import collections -from torch.utils.data._utils.collate import default_collate_fn_map, default_collate_err_msg_format -from typing import Callable, Dict, Optional, Tuple, Type, Union, List - - -def cat_collate_tensor_fn(batch, *, collate_fn_map): - return torch.cat(batch, dim=0) - - -def cat_collate_list_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None): - return [item for bb in batch for item in bb] # concatenate all lists - - -cat_collate_fn_map = default_collate_fn_map.copy() -cat_collate_fn_map[torch.Tensor] = cat_collate_tensor_fn -cat_collate_fn_map[List] = cat_collate_list_fn -cat_collate_fn_map[type(None)] = lambda _, **kw: None # When some Nones, simply return a single None - - -def cat_collate(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None): - r"""Custom collate function that concatenates stuff instead of stacking them, and handles NoneTypes """ - elem = batch[0] - elem_type = type(elem) - - if collate_fn_map is not None: - if elem_type in collate_fn_map: - return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map) - - for collate_type in collate_fn_map: - if isinstance(elem, collate_type): - return collate_fn_map[collate_type](batch, collate_fn_map=collate_fn_map) - - if isinstance(elem, collections.abc.Mapping): - try: - return elem_type({key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}) - except TypeError: - # The mapping type may not support `__init__(iterable)`. - return {key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem} - elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple - return elem_type(*(cat_collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch))) - elif isinstance(elem, collections.abc.Sequence): - transposed = list(zip(*batch)) # It may be accessed twice, so we use a list. - - if isinstance(elem, tuple): - # Backwards compatibility. - return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] - else: - try: - return elem_type([cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]) - except TypeError: - # The sequence type may not support `__init__(iterable)` (e.g., `range`). - return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] - - raise TypeError(default_collate_err_msg_format.format(elem_type)) diff --git a/modules/mast3r/utils/misc.py b/modules/mast3r/utils/misc.py deleted file mode 100644 index 1a5403c67116f5156e47537df8fbcfcacb7bb474..0000000000000000000000000000000000000000 --- a/modules/mast3r/utils/misc.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# utilitary functions for MASt3R -# -------------------------------------------------------- -import os -import hashlib - - -def mkdir_for(f): - os.makedirs(os.path.dirname(f), exist_ok=True) - return f - - -def hash_md5(s): - return hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/modules/mast3r/utils/path_to_dust3r.py b/modules/mast3r/utils/path_to_dust3r.py deleted file mode 100644 index 5d47979a1e8a0f34e28327fcfe423bd85ddfaa87..0000000000000000000000000000000000000000 --- a/modules/mast3r/utils/path_to_dust3r.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). -# -# -------------------------------------------------------- -# dust3r submodule import -# -------------------------------------------------------- - -import sys -import os.path as path -HERE_PATH = path.normpath(path.dirname(__file__)) -DUSt3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../')) -DUSt3R_LIB_PATH = path.join(DUSt3R_REPO_PATH, 'dust3r') -# check the presence of models directory in repo to be sure its cloned -if path.isdir(DUSt3R_LIB_PATH): - # workaround for sibling import - sys.path.insert(0, DUSt3R_REPO_PATH) -else: - raise ImportError(f"dust3r is not initialized, could not find: {DUSt3R_LIB_PATH}.\n " - "Did you forget to run 'git submodule update --init --recursive' ?")