thinh-researcher committed on
Commit 6e9c433 · 1 Parent(s): 2cadd70
Files changed (49)
  1. .gitignore +28 -0
  2. README.md +72 -1
  3. definition.py +21 -0
  4. docs/figures/proposed_method_v5.drawio.png +0 -0
  5. docs/references/Dataset.bib +124 -0
  6. docs/references/References.bib +190 -0
  7. docs/references/SOTAs.bib +355 -0
  8. requirements-lock.txt +103 -0
  9. requirements.txt +18 -0
  10. s_multimae/__init__.py +0 -0
  11. s_multimae/configs/__init__.py +0 -0
  12. s_multimae/configs/base_config.py +164 -0
  13. s_multimae/configs/data_augmentation_config.py +19 -0
  14. s_multimae/configs/experiment_config.py +31 -0
  15. s_multimae/configs/experiment_configs/__init__.py +0 -0
  16. s_multimae/configs/experiment_configs/expv1_dynamic.py +277 -0
  17. s_multimae/da/__init__.py +0 -0
  18. s_multimae/da/base_da.py +33 -0
  19. s_multimae/da/dav6.py +147 -0
  20. s_multimae/data_augmentation.py +19 -0
  21. s_multimae/model/__init__.py +0 -0
  22. s_multimae/model/components.py +117 -0
  23. s_multimae/model/multimae.py +938 -0
  24. s_multimae/model_pl.py +105 -0
  25. s_multimae/rgbd_model.py +60 -0
  26. s_multimae/utils.py +236 -0
  27. s_multimae/visualize_2d_posemb.py +58 -0
  28. s_multimae/visualizer.py +711 -0
  29. streamlit_apps/__init__.py +0 -0
  30. streamlit_apps/app.py +91 -0
  31. streamlit_apps/app_utils/__init__.py +0 -0
  32. streamlit_apps/app_utils/app_env.py +16 -0
  33. streamlit_apps/app_utils/app_utils.py +83 -0
  34. streamlit_apps/app_utils/base_model.py +54 -0
  35. streamlit_apps/app_utils/color_selection_ui.py +10 -0
  36. streamlit_apps/app_utils/depth_model.py +77 -0
  37. streamlit_apps/app_utils/depth_selection_ui.py +27 -0
  38. streamlit_apps/app_utils/device.py +5 -0
  39. streamlit_apps/app_utils/dpt/__init__.py +0 -0
  40. streamlit_apps/app_utils/dpt/base_model.py +16 -0
  41. streamlit_apps/app_utils/dpt/blocks.py +383 -0
  42. streamlit_apps/app_utils/dpt/midas_net.py +78 -0
  43. streamlit_apps/app_utils/dpt/models.py +124 -0
  44. streamlit_apps/app_utils/dpt/transforms.py +231 -0
  45. streamlit_apps/app_utils/dpt/vit.py +576 -0
  46. streamlit_apps/app_utils/image_inference.py +88 -0
  47. streamlit_apps/app_utils/model.py +84 -0
  48. streamlit_apps/app_utils/smultimae_model.py +43 -0
  49. streamlit_apps/app_utils/sod_selection_ui.py +111 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ env
2
+ __pycache__/
3
+ *.pth
4
+ *.pt
5
+
6
+ datasets/**/benchmark/*
7
+ datasets/**/test/*
8
+ datasets/**/train/*
9
+ datasets/**/dev/*
10
+ datasets/**/*.zip
11
+
12
+ sources/deployment/*
13
+ sources/experiment/*
14
+ sources/pickle/*
15
+ sources/csv/*/*
16
+ sources/json/*
17
+ sotas/*
18
+ continue_training/*
19
+
20
+ weights/*
21
+
22
+ !*.gitkeep
23
+ logs
24
+ wandb
25
+ tmp/
26
+
27
+ wandb_cache
28
+ script.md
README.md CHANGED
@@ -9,4 +9,75 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
9
  pinned: false
10
  ---
11
 
12
+ # S-MultiMAE
13
+
14
+ This repository provides the official implementation of `S-MultiMAE: A Multi-Ground Truth approach for RGB-D Saliency Detection`.
15
+
16
+ _Nguyen Truong Thinh Huynh, Van Linh Pham, Xuan Toan Mai and Tuan Anh Tran_
17
+
18
+ ![Overview of the proposed method](docs/figures/proposed_method_v5.drawio.png)
19
+
20
+ ## Model weights
21
+
22
+ | Backbone | #params | Training paradigm | Weights | Input size |
23
+ | -------- | ----------- | ----------------- | ---------------------------------------------------------------------------------------------- | ---------- |
24
+ | ViT-L | 328,318,529 | Multi-GT | [Download](https://drive.google.com/file/d/1YhAuu3DI2adPLQgbgoSt74ilZbpuKihh/view?usp=sharing) | 224x224 |
25
+ | ViT-B | 107,654,977 | Multi-GT | [Download](https://drive.google.com/file/d/13Omafif3pvPKgg3Isp_srkHf8CSPx33d/view?usp=sharing) | 224x224 |
26
+
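+ As a quick sanity check after downloading, the sketch below counts the parameters in a checkpoint (it assumes a standard PyTorch `.pth` file holding either a raw `state_dict` or a Lightning-style dict with a `state_dict` key; the exact layout may differ):
+
+ ```python
+ import torch
+
+ ckpt = torch.load("weights/s-multimae-cfgv4_0_2007-top1.pth", map_location="cpu")
+ state_dict = ckpt.get("state_dict", ckpt)
+ print(sum(p.numel() for p in state_dict.values()))  # should roughly match the #params column above
+ ```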
27
+ ## How to run
28
+
29
+ ### Create a virtual environment
30
+
31
+ We recommend using Python 3.10 or higher.
32
+
33
+ ```bash
34
+ python3.10 -m venv env
35
+ source env/bin/activate
36
+ pip install -r requirements.txt
37
+ ```
38
+
39
+ ### Download trained weights
40
+
41
+ - Download the model weights and put them in the `weights` folder. You may also need to download the weights of the [DPT model]() (an RGB-to-depth model). The `weights` folder should look like this:
42
+
43
+ ```bash
44
+ ├── weights
45
+ │ ├── omnidata_rgb2depth_dpt_hybrid.pth
46
+ │ ├── s-multimae-cfgv4_0_2006-top1.pth
47
+ │ ├── s-multimae-cfgv4_0_2007-top1.pth
48
+ ```
49
+
50
+ ### Run
51
+
52
+ - Run the Streamlit app (it will be served at http://localhost:9113):
53
+
54
+ ```bash
55
+ streamlit run streamlit_apps/app.py --server.port 9113 --browser.gatherUsageStats False --server.fileWatcherType none
56
+ ```
57
+
58
+ ## Datasets
59
+
60
+ ### COME15K dataset
61
+
+ The table below shows, for each split, the percentage of samples annotated with a given number of ground-truth saliency maps:
+
62
+ | | 1 GT | 2 GTs | 3 GTs | 4 GTs | 5 GTs |
63
+ | --------------------- | ------ | ----- | ------ | ----- | ----- |
64
+ | COME8K (8025 samples) | 77.61% | 1.71% | 18.28% | 2.24% | 0.16% |
65
+ | COME-E (4600 samples) | 70.5% | 1.87% | 21.15% | 5.70% | 0.78% |
66
+ | COME-H (3000 samples) | 62.30% | 2.00% | 25.63% | 8.37% | 1.70% |
67
+
+ If you use the COME15K dataset, please cite:
68
+ ```bibtex
69
+ @inproceedings{cascaded_rgbd_sod,
70
+ title={RGB-D Saliency Detection via Cascaded Mutual Information Minimization},
71
+ author={Zhang, Jing and Fan, Deng-Ping and Dai, Yuchao and Yu, Xin and Zhong, Yiran and Barnes, Nick and Shao, Ling},
72
+ booktitle={International Conference on Computer Vision (ICCV)},
73
+ year={2021}
74
+ }
75
+ ```
76
+
77
+ ## References
78
+
79
+ All references are cited in these files:
80
+
81
+ - [Datasets](./docs/references/Dataset.bib)
82
+ - [SOTAs](./docs/references/SOTAs.bib)
83
+ - [Others](./docs/references/References.bib)
definition.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Do not import other modules!
3
+ """
4
+
5
+
6
+ class PRETRAINED_BACKBONE:
7
+ MULTIMAE = "multimae"
8
+
9
+ S_MULTIMAE = "s-multimae"
10
+ LARGE_S_MULTIMAE = "large-s-multimae"
11
+
12
+ MAE = "mae"
13
+ LARGE_MAE = "large-mae"
14
+ HUGE_MAE = "huge-mae"
15
+
16
+ FINETUNE_LARGE_S_MULTIMAE = "finetune-large-s-multimae"
17
+ FINETUNE_S_MULTIMAE = "finetune-s-multimae"
18
+
19
+ VIT = "vit" # train from supervised model
20
+
21
+ NONE = None # train from scratch
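The experiment configs select a backbone through these constants; a minimal usage sketch (mirroring how `expv1_dynamic.py` uses them):

```python
from definition import PRETRAINED_BACKBONE

# As in the provided configs: cfgv4_0_2007 uses the ViT-B backbone,
# cfgv4_0_2006 the ViT-L backbone.
base_backbone = PRETRAINED_BACKBONE.S_MULTIMAE         # "s-multimae"
large_backbone = PRETRAINED_BACKBONE.LARGE_S_MULTIMAE  # "large-s-multimae"
print(base_backbone, large_backbone)
```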
docs/figures/proposed_method_v5.drawio.png ADDED
docs/references/Dataset.bib ADDED
@@ -0,0 +1,124 @@
1
+ % Encoding: UTF-8
2
+
3
+ % DES
4
+ @inproceedings{cheng2014depth,
5
+ title={Depth enhanced saliency detection method},
6
+ author={Cheng, Yupeng and Fu, Huazhu and Wei, Xingxing and Xiao, Jiangjian and Cao, Xiaochun},
7
+ booktitle={Proceedings of international conference on internet multimedia computing and service},
8
+ pages={23--27},
9
+ year={2014}
10
+ }
11
+
12
+ % DUT-RGBD
13
+ @inproceedings{piao2019depth,
14
+ title={Depth-induced multi-scale recurrent attention network for saliency detection},
15
+ author={Piao, Yongri and Ji, Wei and Li, Jingjing and Zhang, Miao and Lu, Huchuan},
16
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
17
+ pages={7254--7263},
18
+ year={2019}
19
+ }
20
+
21
+ % LFSD
22
+ @inproceedings{li2014saliency,
23
+ title={Saliency detection on light field},
24
+ author={Li, Nianyi and Ye, Jinwei and Ji, Yu and Ling, Haibin and Yu, Jingyi},
25
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
26
+ pages={2806--2813},
27
+ year={2014}
28
+ }
29
+
30
+ % NJU2K
31
+ @inproceedings{ju2014depth,
32
+ title={Depth saliency based on anisotropic center-surround difference},
33
+ author={Ju, Ran and Ge, Ling and Geng, Wenjing and Ren, Tongwei and Wu, Gangshan},
34
+ booktitle={2014 IEEE international conference on image processing (ICIP)},
35
+ pages={1115--1119},
36
+ year={2014},
37
+ organization={IEEE}
38
+ }
39
+
40
+ % SSD
41
+ @inproceedings{zhu2017three,
42
+ title={A three-pathway psychobiological framework of salient object detection using stereoscopic technology},
43
+ author={Zhu, Chunbiao and Li, Ge},
44
+ booktitle={Proceedings of the IEEE international conference on computer vision workshops},
45
+ pages={3008--3014},
46
+ year={2017}
47
+ }
48
+
49
+ % Holo50K
50
+ @article{hua2020holopix50k,
51
+ title={Holopix50k: A large-scale in-the-wild stereo image dataset},
52
+ author={Hua, Yiwen and Kohli, Puneet and Uplavikar, Pritish and Ravi, Anand and Gunaseelan, Saravana and Orozco, Jason and Li, Edward},
53
+ journal={arXiv preprint arXiv:2003.11172},
54
+ year={2020}
55
+ }
56
+
57
+ % NLPR
58
+ @inproceedings{peng2014rgbd,
59
+ title={RGBD salient object detection: A benchmark and algorithms},
60
+ author={Peng, Houwen and Li, Bing and Xiong, Weihua and Hu, Weiming and Ji, Rongrong},
61
+ booktitle={European conference on computer vision},
62
+ pages={92--109},
63
+ year={2014},
64
+ organization={Springer}
65
+ }
66
+
67
+
68
+ % SIP
69
+ @article{fan2020rethinking,
70
+ title={Rethinking RGB-D salient object detection: Models, data sets, and large-scale benchmarks},
71
+ author={Fan, Deng-Ping and Lin, Zheng and Zhang, Zhao and Zhu, Menglong and Cheng, Ming-Ming},
72
+ journal={IEEE Transactions on neural networks and learning systems},
73
+ volume={32},
74
+ number={5},
75
+ pages={2075--2089},
76
+ year={2020},
77
+ publisher={IEEE}
78
+ }
79
+
80
+ % STERE
81
+ @inproceedings{niu2012leveraging,
82
+ title={Leveraging stereopsis for saliency analysis},
83
+ author={Niu, Yuzhen and Geng, Yujie and Li, Xueqing and Liu, Feng},
84
+ booktitle={2012 IEEE Conference on Computer Vision and Pattern Recognition},
85
+ pages={454--461},
86
+ year={2012},
87
+ organization={IEEE}
88
+ }
89
+
90
+ % RGB-Thermal
91
+ @inproceedings{ha2017mfnet,
92
+ title={MFNet: Towards real-time semantic segmentation for autonomous vehicles with multi-spectral scenes},
93
+ author={Ha, Qishen and Watanabe, Kohei and Karasawa, Takumi and Ushiku, Yoshitaka and Harada, Tatsuya},
94
+ booktitle={2017 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
95
+ pages={5108--5115},
96
+ year={2017},
97
+ organization={IEEE}
98
+ }
99
+
100
+ %RGB-Polarization
101
+ @article{xiang2021polarization,
102
+ title={Polarization-driven semantic segmentation via efficient attention-bridged fusion},
103
+ author={Xiang, Kaite and Yang, Kailun and Wang, Kaiwei},
104
+ journal={Optics Express},
105
+ volume={29},
106
+ number={4},
107
+ pages={4802--4820},
108
+ year={2021},
109
+ publisher={Optica Publishing Group}
110
+ }
111
+
112
+ % ImageNet
113
+ @article{russakovsky2015imagenet,
114
+ title={Imagenet large scale visual recognition challenge},
115
+ author={Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and others},
116
+ journal={International journal of computer vision},
117
+ volume={115},
118
+ number={3},
119
+ pages={211--252},
120
+ year={2015},
121
+ publisher={Springer}
122
+ }
123
+
124
+ @Comment{jabref-meta: databaseType:bibtex;}
docs/references/References.bib ADDED
@@ -0,0 +1,190 @@
1
+ % Encoding: UTF-8
2
+
3
+ % An Empirical Study of Training Self-Supervised Vision Transformers
4
+ @inproceedings{chen2021empirical,
5
+ title={An empirical study of training self-supervised vision transformers},
6
+ author={Chen, Xinlei and Xie, Saining and He, Kaiming},
7
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
8
+ pages={9640--9649},
9
+ year={2021}
10
+ }
11
+
12
+ % 2D positional embedding
13
+ @article{raisi20202d,
14
+ title={2D positional embedding-based transformer for scene text recognition},
15
+ author={Raisi, Zobeir and Naiel, Mohamed A and Fieguth, Paul and Wardell, Steven and Zelek, John},
16
+ journal={Journal of Computational Vision and Imaging Systems},
17
+ volume={6},
18
+ number={1},
19
+ pages={1--4},
20
+ year={2020}
21
+ }
22
+
23
+ % Layer Normalization
24
+ @article{ba2016layer,
25
+ title={Layer normalization},
26
+ author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
27
+ journal={arXiv preprint arXiv:1607.06450},
28
+ year={2016}
29
+ }
30
+
31
+ % Batch Normalization
32
+ @inproceedings{ioffe2015batch,
33
+ title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
34
+ author={Ioffe, Sergey and Szegedy, Christian},
35
+ booktitle={International conference on machine learning},
36
+ pages={448--456},
37
+ year={2015},
38
+ organization={PMLR}
39
+ }
40
+
41
+ % ReLU
42
+ @article{fukushima1975cognitron,
43
+ title={Cognitron: A self-organizing multilayered neural network},
44
+ author={Fukushima, Kunihiko},
45
+ journal={Biological cybernetics},
46
+ volume={20},
47
+ number={3},
48
+ pages={121--136},
49
+ year={1975},
50
+ publisher={Springer}
51
+ }
52
+
53
+ % Weight Normalization
54
+ @article{salimans2016weight,
55
+ title={Weight normalization: A simple reparameterization to accelerate training of deep neural networks},
56
+ author={Salimans, Tim and Kingma, Durk P},
57
+ journal={Advances in neural information processing systems},
58
+ volume={29},
59
+ year={2016}
60
+ }
61
+
62
+ % Stochastic depth
63
+ @inproceedings{huang2016deep,
64
+ title={Deep networks with stochastic depth},
65
+ author={Huang, Gao and Sun, Yu and Liu, Zhuang and Sedra, Daniel and Weinberger, Kilian Q},
66
+ booktitle={European conference on computer vision},
67
+ pages={646--661},
68
+ year={2016},
69
+ organization={Springer}
70
+ }
71
+
72
+ % Stereo Matching Algorithm
73
+ @article{zhong2020displacement,
74
+ title={Displacement-invariant cost computation for efficient stereo matching},
75
+ author={Zhong, Yiran and Loop, Charles and Byeon, Wonmin and Birchfield, Stan and Dai, Yuchao and Zhang, Kaihao and Kamenev, Alexey and Breuel, Thomas and Li, Hongdong and Kautz, Jan},
76
+ journal={arXiv preprint arXiv:2012.00899},
77
+ year={2020}
78
+ }
79
+
80
+ % wandb
81
+ @misc{wandb,
82
+ title = {Experiment Tracking with Weights and Biases},
83
+ year = {2020},
84
+ note = {Software available from wandb.com},
85
+ url={https://www.wandb.com/},
86
+ author = {Biewald, Lukas},
87
+ }
88
+
89
+ %
90
+ @article{borji2015salient,
91
+ title={Salient object detection: A benchmark},
92
+ author={Borji, Ali and Cheng, Ming-Ming and Jiang, Huaizu and Li, Jia},
93
+ journal={IEEE transactions on image processing},
94
+ volume={24},
95
+ number={12},
96
+ pages={5706--5722},
97
+ year={2015},
98
+ publisher={IEEE}
99
+ }
100
+
101
+ % SOD metrics
102
+ @misc{sodmetrics,
103
+ title = {PySODMetrics: A simple and efficient implementation of SOD metrics},
104
+ howpublished = {\url{https://github.com/lartpang/PySODMetrics}},
105
+ note = {Accessed: 2022-10-31}
106
+ }
107
+
108
+ % MAE
109
+ @inproceedings{perazzi2012saliency,
110
+ title={Saliency filters: Contrast based filtering for salient region detection},
111
+ author={Perazzi, Federico and Kr{\"a}henb{\"u}hl, Philipp and Pritch, Yael and Hornung, Alexander},
112
+ booktitle={2012 IEEE conference on computer vision and pattern recognition},
113
+ pages={733--740},
114
+ year={2012},
115
+ organization={IEEE}
116
+ }
117
+
118
+ % F-measure
119
+ @inproceedings{achanta2009frequency,
120
+ title={Frequency-tuned salient region detection},
121
+ author={Achanta, Radhakrishna and Hemami, Sheila and Estrada, Francisco and Susstrunk, Sabine},
122
+ booktitle={2009 IEEE conference on computer vision and pattern recognition},
123
+ pages={1597--1604},
124
+ year={2009},
125
+ organization={IEEE}
126
+ }
127
+
128
+ % E-measure
129
+ @article{fan2018enhanced,
130
+ title={Enhanced-alignment measure for binary foreground map evaluation},
131
+ author={Fan, Deng-Ping and Gong, Cheng and Cao, Yang and Ren, Bo and Cheng, Ming-Ming and Borji, Ali},
132
+ journal={arXiv preprint arXiv:1805.10421},
133
+ year={2018}
134
+ }
135
+
136
+ % S-measure
137
+ @inproceedings{fan2017structure,
138
+ title={Structure-measure: A new way to evaluate foreground maps},
139
+ author={Fan, Deng-Ping and Cheng, Ming-Ming and Liu, Yun and Li, Tao and Borji, Ali},
140
+ booktitle={Proceedings of the IEEE international conference on computer vision},
141
+ pages={4548--4557},
142
+ year={2017}
143
+ }
144
+
145
+ % GELU
146
+ @article{hendrycks2016gaussian,
147
+ title={Gaussian error linear units (gelus)},
148
+ author={Hendrycks, Dan and Gimpel, Kevin},
149
+ journal={arXiv preprint arXiv:1606.08415},
150
+ year={2016}
151
+ }
152
+
153
+ % Instance normalization
154
+ @article{ulyanov2016instance,
155
+ title={Instance normalization: The missing ingredient for fast stylization},
156
+ author={Ulyanov, Dmitry and Vedaldi, Andrea and Lempitsky, Victor},
157
+ journal={arXiv preprint arXiv:1607.08022},
158
+ year={2016}
159
+ }
160
+
161
+ % Group normalization
162
+ @inproceedings{wu2018group,
163
+ title={Group normalization},
164
+ author={Wu, Yuxin and He, Kaiming},
165
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
166
+ pages={3--19},
167
+ year={2018}
168
+ }
169
+
170
+ % timm
171
+ @misc{rw2019timm,
172
+ author = {Ross Wightman},
173
+ title = {PyTorch Image Models},
174
+ year = {2019},
175
+ publisher = {GitHub},
176
+ journal = {GitHub repository},
177
+ doi = {10.5281/zenodo.4414861},
178
+ howpublished = {\url{https://github.com/rwightman/pytorch-image-models}}
179
+ }
180
+
181
+ % taskonomy
182
+ @inproceedings{zamir2018taskonomy,
183
+ title={Taskonomy: Disentangling task transfer learning},
184
+ author={Zamir, Amir R and Sax, Alexander and Shen, William and Guibas, Leonidas J and Malik, Jitendra and Savarese, Silvio},
185
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
186
+ pages={3712--3722},
187
+ year={2018}
188
+ }
189
+
190
+ @Comment{jabref-meta: databaseType:bibtex;}
docs/references/SOTAs.bib ADDED
@@ -0,0 +1,355 @@
1
+ % Encoding: UTF-8
2
+
3
+ % COME15K CMINet
4
+ @inproceedings{cascaded_rgbd_sod,
5
+ title={RGB-D Saliency Detection via Cascaded Mutual Information Minimization},
6
+ author={Zhang, Jing and Fan, Deng-Ping and Dai, Yuchao and Yu, Xin and Zhong, Yiran and Barnes, Nick and Shao, Ling},
7
+ booktitle={International Conference on Computer Vision (ICCV)},
8
+ year={2021}
9
+ }
10
+
11
+ % A2dele
12
+ @inproceedings{piao2020a2dele,
13
+ title={A2dele: Adaptive and attentive depth distiller for efficient RGB-D salient object detection},
14
+ author={Piao, Yongri and Rong, Zhengkun and Zhang, Miao and Ren, Weisong and Lu, Huchuan},
15
+ booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
16
+ pages={9060--9069},
17
+ year={2020}
18
+ }
19
+
20
+ % BBS-Net
21
+ @inproceedings{fan2020bbs,
22
+ title={BBS-Net: RGB-D salient object detection with a bifurcated backbone strategy network},
23
+ author={Fan, Deng-Ping and Zhai, Yingjie and Borji, Ali and Yang, Jufeng and Shao, Ling},
24
+ booktitle={European conference on computer vision},
25
+ pages={275--292},
26
+ year={2020},
27
+ organization={Springer}
28
+ }
29
+
30
+ % MobileSal
31
+ @article{wu2021mobilesal,
32
+ title={MobileSal: Extremely efficient RGB-D salient object detection},
33
+ author={Wu, Yu-Huan and Liu, Yun and Xu, Jun and Bian, Jia-Wang and Gu, Yu-Chao and Cheng, Ming-Ming},
34
+ journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
35
+ year={2021},
36
+ publisher={IEEE}
37
+ }
38
+
39
+ % ATSA
40
+ @inproceedings{zhang2020asymmetric,
41
+ title={Asymmetric two-stream architecture for accurate RGB-D saliency detection},
42
+ author={Zhang, Miao and Fei, Sun Xiao and Liu, Jie and Xu, Shuang and Piao, Yongri and Lu, Huchuan},
43
+ booktitle={European Conference on Computer Vision},
44
+ pages={374--390},
45
+ year={2020},
46
+ organization={Springer}
47
+ }
48
+
49
+ % CDNet
50
+ @article{jin2021cdnet,
51
+ title={CDNet: Complementary depth network for RGB-D salient object detection},
52
+ author={Jin, Wen-Da and Xu, Jun and Han, Qi and Zhang, Yi and Cheng, Ming-Ming},
53
+ journal={IEEE Transactions on Image Processing},
54
+ volume={30},
55
+ pages={3376--3390},
56
+ year={2021},
57
+ publisher={IEEE}
58
+ }
59
+
60
+ % CoNet
61
+ @inproceedings{ji2020accurate,
62
+ title={Accurate RGB-D salient object detection via collaborative learning},
63
+ author={Ji, Wei and Li, Jingjing and Zhang, Miao and Piao, Yongri and Lu, Huchuan},
64
+ booktitle={European Conference on Computer Vision},
65
+ pages={52--69},
66
+ year={2020},
67
+ organization={Springer}
68
+ }
69
+
70
+ % SPNet
71
+ @inproceedings{zhou2021specificity,
72
+ title={Specificity-preserving rgb-d saliency detection},
73
+ author={Zhou, Tao and Fu, Huazhu and Chen, Geng and Zhou, Yi and Fan, Deng-Ping and Shao, Ling},
74
+ booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
75
+ pages={4681--4691},
76
+ year={2021}
77
+ }
78
+
79
+ % C2DFNet
80
+ @article{zhang2022c,
81
+ title={C2DFNet: Criss-Cross Dynamic Filter Network for RGB-D Salient Object Detection},
82
+ author={Zhang, Miao and Yao, Shunyu and Hu, Beiqi and Piao, Yongri and Ji, Wei},
83
+ journal={IEEE Transactions on Multimedia},
84
+ year={2022},
85
+ publisher={IEEE}
86
+ }
87
+
88
+ % SPSN
89
+ @inproceedings{lee2022spsn,
90
+ title={SPSN: Superpixel Prototype Sampling Network for RGB-D Salient Object Detection},
91
+ author={Lee, Minhyeok and Park, Chaewon and Cho, Suhwan and Lee, Sangyoun},
92
+ booktitle={European Conference on Computer Vision},
93
+ pages={630--647},
94
+ year={2022},
95
+ organization={Springer}
96
+ }
97
+
98
+ % ConvNeXt
99
+ @inproceedings{liu2022convnet,
100
+ title={A convnet for the 2020s},
101
+ author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
102
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
103
+ pages={11976--11986},
104
+ year={2022}
105
+ }
106
+
107
+ % GPT-2
108
+ @article{radford2019language,
109
+ title={Language models are unsupervised multitask learners},
110
+ author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
111
+ journal={OpenAI blog},
112
+ volume={1},
113
+ number={8},
114
+ pages={9},
115
+ year={2019}
116
+ }
117
+
118
+ % BERT
119
+ @article{devlin2018bert,
120
+ title={Bert: Pre-training of deep bidirectional transformers for language understanding},
121
+ author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
122
+ journal={arXiv preprint arXiv:1810.04805},
123
+ year={2018}
124
+ }
125
+
126
+ % UNet
127
+ @inproceedings{ronneberger2015u,
128
+ title={U-net: Convolutional networks for biomedical image segmentation},
129
+ author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
130
+ booktitle={International Conference on Medical image computing and computer-assisted intervention},
131
+ pages={234--241},
132
+ year={2015},
133
+ organization={Springer}
134
+ }
135
+
136
+ % MobileNetV2
137
+ @inproceedings{sandler2018mobilenetv2,
138
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
139
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
140
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
141
+ pages={4510--4520},
142
+ year={2018}
143
+ }
144
+
145
+ % Xception
146
+ @inproceedings{chollet2017xception,
147
+ title={Xception: Deep learning with depthwise separable convolutions},
148
+ author={Chollet, Fran{\c{c}}ois},
149
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
150
+ pages={1251--1258},
151
+ year={2017}
152
+ }
153
+
154
+ % MobileNets
155
+ @article{howard2017mobilenets,
156
+ title={Mobilenets: Efficient convolutional neural networks for mobile vision applications},
157
+ author={Howard, Andrew G and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
158
+ journal={arXiv preprint arXiv:1704.04861},
159
+ year={2017}
160
+ }
161
+
162
+ % ResNet
163
+ @inproceedings{he2016deep,
164
+ title={Deep residual learning for image recognition},
165
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
166
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
167
+ pages={770--778},
168
+ year={2016}
169
+ }
170
+
171
+ % ResNeXt
172
+ @inproceedings{xie2017aggregated,
173
+ title={Aggregated residual transformations for deep neural networks},
174
+ author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
175
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
176
+ pages={1492--1500},
177
+ year={2017}
178
+ }
179
+
180
+ % MultiMAE
181
+ @article{bachmann2022multimae,
182
+ title={MultiMAE: Multi-modal Multi-task Masked Autoencoders},
183
+ author={Bachmann, Roman and Mizrahi, David and Atanov, Andrei and Zamir, Amir},
184
+ journal={arXiv preprint arXiv:2204.01678},
185
+ year={2022}
186
+ }
187
+
188
+ % MAE
189
+ @inproceedings{he2022masked,
190
+ title={Masked autoencoders are scalable vision learners},
191
+ author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
192
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
193
+ pages={16000--16009},
194
+ year={2022}
195
+ }
196
+
197
+ % VisionTransformer, ViT
198
+ @article{dosovitskiy2020image,
199
+ title={An image is worth 16x16 words: Transformers for image recognition at scale},
200
+ author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
201
+ journal={arXiv preprint arXiv:2010.11929},
202
+ year={2020}
203
+ }
204
+
205
+ % DANet
206
+ @inproceedings{zhao2020single,
207
+ title={A single stream network for robust and real-time RGB-D salient object detection},
208
+ author={Zhao, Xiaoqi and Zhang, Lihe and Pang, Youwei and Lu, Huchuan and Zhang, Lei},
209
+ booktitle={European Conference on Computer Vision},
210
+ pages={646--662},
211
+ year={2020},
212
+ organization={Springer}
213
+ }
214
+
215
+ % DCF
216
+ @inproceedings{Ji_2021_DCF,
217
+ author = {Ji, Wei and Li, Jingjing and Yu, Shuang and Zhang, Miao and Piao, Yongri and Yao, Shunyu and Bi, Qi and Ma, Kai and Zheng, Yefeng and Lu, Huchuan and Cheng, Li},
218
+ title = {Calibrated RGB-D Salient Object Detection},
219
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
220
+ year = {2021},
221
+ pages = {9471-9481}
222
+ }
223
+
224
+ % MVSalNet
225
+ @inproceedings{zhou2022mvsalnet,
226
+ title={MVSalNet: Multi-view Augmentation for RGB-D Salient Object Detection},
227
+ author={Zhou, Jiayuan and Wang, Lijun and Lu, Huchuan and Huang, Kaining and Shi, Xinchu and Liu, Bocong},
228
+ booktitle={European Conference on Computer Vision},
229
+ pages={270--287},
230
+ year={2022},
231
+ organization={Springer}
232
+ }
233
+
234
+ % DSA2F
235
+ @Article{Sun2021DeepRS,
236
+ title={Deep RGB-D Saliency Detection with Depth-Sensitive Attention and Automatic Multi-Modal Fusion},
237
+ author={P. Sun and Wenhu Zhang and Huanyu Wang and Songyuan Li and Xi Li},
238
+ journal={IEEE Conf. Comput. Vis. Pattern Recog.},
239
+ year={2021}
240
+ }
241
+
242
+ % FRDT
243
+ @inproceedings{zhang2020feature,
244
+ title={Feature reintegration over differential treatment: A top-down and adaptive fusion network for RGB-D salient object detection},
245
+ author={Zhang, Miao and Zhang, Yu and Piao, Yongri and Hu, Beiqi and Lu, Huchuan},
246
+ booktitle={Proceedings of the 28th ACM international conference on multimedia},
247
+ pages={4107--4115},
248
+ year={2020}
249
+ }
250
+
251
+ % HAINet
252
+ @article{li2021hierarchical,
253
+ title={Hierarchical alternate interaction network for RGB-D salient object detection},
254
+ author={Li, Gongyang and Liu, Zhi and Chen, Minyu and Bai, Zhen and Lin, Weisi and Ling, Haibin},
255
+ journal={IEEE Transactions on Image Processing},
256
+ volume={30},
257
+ pages={3528--3542},
258
+ year={2021},
259
+ publisher={IEEE}
260
+ }
261
+
262
+ % JLDCF
263
+ @inproceedings{fu2020jl,
264
+ title={JL-DCF: Joint learning and densely-cooperative fusion framework for RGB-D salient object detection},
265
+ author={Fu, Keren and Fan, Deng-Ping and Ji, Ge-Peng and Zhao, Qijun},
266
+ booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
267
+ pages={3052--3062},
268
+ year={2020}
269
+ }
270
+
271
+ % SSLSOD
272
+ @inproceedings{zhao2022self,
273
+ title={Self-supervised pretraining for rgb-d salient object detection},
274
+ author={Zhao, Xiaoqi and Pang, Youwei and Zhang, Lihe and Lu, Huchuan and Ruan, Xiang},
275
+ booktitle={AAAI Conference on Artificial Intelligence},
276
+ volume={3},
277
+ year={2022}
278
+ }
279
+
280
+ % DFTR
281
+ @article{zhudftr,
282
+ title={DFTR: Depth-supervised Fusion Transformer for Salient Object Detection},
283
+ author={Zhu, Heqin and Sun, Xu and Li, Yuexiang and Ma, Kai and Zhou, S Kevin and Zheng, Yefeng}
284
+ }
285
+
286
+ % PGAR
287
+ @inproceedings{chen2020progressively,
288
+ title={Progressively guided alternate refinement network for RGB-D salient object detection},
289
+ author={Chen, Shuhan and Fu, Yun},
290
+ booktitle={European Conference on Computer Vision},
291
+ pages={520--538},
292
+ year={2020},
293
+ organization={Springer}
294
+ }
295
+
296
+ % DCMF
297
+ @article{wang2022learning,
298
+ title={Learning Discriminative Cross-Modality Features for RGB-D Saliency Detection},
299
+ author={Wang, Fengyun and Pan, Jinshan and Xu, Shoukun and Tang, Jinhui},
300
+ journal={IEEE Transactions on Image Processing},
301
+ volume={31},
302
+ pages={1285--1297},
303
+ year={2022},
304
+ publisher={IEEE}
305
+ }
306
+
307
+ % RD3D
308
+ @inproceedings{chen2021rgb,
309
+ title={RGB-D salient object detection via 3D convolutional neural networks},
310
+ author={Chen, Qian and Liu, Ze and Zhang, Yi and Fu, Keren and Zhao, Qijun and Du, Hongwei},
311
+ booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
312
+ volume={35},
313
+ number={2},
314
+ pages={1063--1071},
315
+ year={2021}
316
+ }
317
+
318
+ % ReDWeb-S
319
+ % S2MA
320
+ @inproceedings{liu2020learning,
321
+ title={Learning selective self-mutual attention for RGB-D saliency detection},
322
+ author={Liu, Nian and Zhang, Ni and Han, Junwei},
323
+ booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
324
+ pages={13756--13765},
325
+ year={2020}
326
+ }
327
+
328
+ % SSF
329
+ @inproceedings{zhang2020select,
330
+ title={Select, supplement and focus for RGB-D saliency detection},
331
+ author={Zhang, Miao and Ren, Weisong and Piao, Yongri and Rong, Zhengkun and Lu, Huchuan},
332
+ booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
333
+ pages={3472--3481},
334
+ year={2020}
335
+ }
336
+
337
+ % UCNet
338
+ @inproceedings{zhang2020uc,
339
+ title={UC-Net: Uncertainty inspired RGB-D saliency detection via conditional variational autoencoders},
340
+ author={Zhang, Jing and Fan, Deng-Ping and Dai, Yuchao and Anwar, Saeed and Saleh, Fatemeh Sadat and Zhang, Tong and Barnes, Nick},
341
+ booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
342
+ pages={8582--8591},
343
+ year={2020}
344
+ }
345
+
346
+ % TriTransNet
347
+ @inproceedings{liu2021tritransnet,
348
+ title={TriTransNet: RGB-D salient object detection with a triplet transformer embedding network},
349
+ author={Liu, Zhengyi and Wang, Yuan and Tu, Zhengzheng and Xiao, Yun and Tang, Bin},
350
+ booktitle={Proceedings of the 29th ACM international conference on multimedia},
351
+ pages={4481--4490},
352
+ year={2021}
353
+ }
354
+
355
+ @Comment{jabref-meta: databaseType:bibtex;}
requirements-lock.txt ADDED
@@ -0,0 +1,103 @@
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ albumentations==1.4.3
4
+ altair==5.3.0
5
+ async-timeout==4.0.3
6
+ attrs==23.2.0
7
+ black==24.3.0
8
+ blinker==1.7.0
9
+ cachetools==5.3.3
10
+ certifi==2024.2.2
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ contourpy==1.2.1
14
+ cycler==0.12.1
15
+ docstring_parser==0.16
16
+ einops==0.7.0
17
+ filelock==3.13.3
18
+ fonttools==4.51.0
19
+ frozenlist==1.4.1
20
+ fsspec==2024.3.1
21
+ gitdb==4.0.11
22
+ GitPython==3.1.43
23
+ huggingface-hub==0.22.2
24
+ idna==3.6
25
+ imageio==2.34.0
26
+ Jinja2==3.1.3
27
+ joblib==1.3.2
28
+ jsonschema==4.21.1
29
+ jsonschema-specifications==2023.12.1
30
+ kiwisolver==1.4.5
31
+ lazy_loader==0.4
32
+ lightning-utilities==0.11.2
33
+ markdown-it-py==3.0.0
34
+ MarkupSafe==2.1.5
35
+ matplotlib==3.8.4
36
+ mdurl==0.1.2
37
+ mpmath==1.3.0
38
+ multidict==6.0.5
39
+ mypy-extensions==1.0.0
40
+ networkx==3.3
41
+ numpy==1.26.4
42
+ nvidia-cublas-cu12==12.1.3.1
43
+ nvidia-cuda-cupti-cu12==12.1.105
44
+ nvidia-cuda-nvrtc-cu12==12.1.105
45
+ nvidia-cuda-runtime-cu12==12.1.105
46
+ nvidia-cudnn-cu12==8.9.2.26
47
+ nvidia-cufft-cu12==11.0.2.54
48
+ nvidia-curand-cu12==10.3.2.106
49
+ nvidia-cusolver-cu12==11.4.5.107
50
+ nvidia-cusparse-cu12==12.1.0.106
51
+ nvidia-nccl-cu12==2.18.1
52
+ nvidia-nvjitlink-cu12==12.4.127
53
+ nvidia-nvtx-cu12==12.1.105
54
+ opencv-python==4.9.0.80
55
+ opencv-python-headless==4.9.0.80
56
+ packaging==24.0
57
+ pandas==2.2.1
58
+ pathspec==0.12.1
59
+ pillow==10.3.0
60
+ platformdirs==4.2.0
61
+ protobuf==4.25.3
62
+ pyarrow==15.0.2
63
+ pycocotools==2.0.7
64
+ pydeck==0.8.1b0
65
+ Pygments==2.17.2
66
+ pyparsing==3.1.2
67
+ python-dateutil==2.9.0.post0
68
+ pytorch-lightning==2.2.1
69
+ pytz==2024.1
70
+ PyYAML==6.0.1
71
+ referencing==0.34.0
72
+ requests==2.31.0
73
+ rich==13.7.1
74
+ rpds-py==0.18.0
75
+ safetensors==0.4.2
76
+ scikit-image==0.22.0
77
+ scikit-learn==1.4.1.post1
78
+ scipy==1.13.0
79
+ six==1.16.0
80
+ smmap==5.0.1
81
+ streamlit==1.33.0
82
+ sympy==1.12
83
+ tenacity==8.2.3
84
+ termcolor==2.4.0
85
+ threadpoolctl==3.4.0
86
+ tifffile==2024.2.12
87
+ timm==0.9.16
88
+ toml==0.10.2
89
+ tomli==2.0.1
90
+ toolz==0.12.1
91
+ torch==2.1.0
92
+ torchmetrics==1.3.2
93
+ torchvision==0.16.0
94
+ tornado==6.4
95
+ tqdm==4.66.2
96
+ triton==2.1.0
97
+ typed-argument-parser==1.9.0
98
+ typing-inspect==0.9.0
99
+ typing_extensions==4.11.0
100
+ tzdata==2024.1
101
+ urllib3==2.2.1
102
+ watchdog==4.0.0
103
+ yarl==1.9.4
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ torch==2.1.0
2
+ torchvision
3
+ opencv-python
4
+ pycocotools
5
+ matplotlib
6
+ Pillow
7
+ numpy
8
+ einops
9
+ timm
10
+ albumentations
11
+ termcolor
12
+ tqdm
13
+ pandas
14
+ typed-argument-parser
15
+ pytorch-lightning
16
+ streamlit
17
+ black
18
+ huggingface-hub
s_multimae/__init__.py ADDED
File without changes
s_multimae/configs/__init__.py ADDED
File without changes
s_multimae/configs/base_config.py ADDED
@@ -0,0 +1,164 @@
1
+ from functools import partial
2
+ import os
3
+ from typing import Dict, Optional, Tuple, List
4
+ import torch
5
+ from torch import nn
6
+ import math
7
+ import albumentations as A
8
+
9
+ from definition import PRETRAINED_BACKBONE
10
+ from .data_augmentation_config import DataAugmentationConfig
11
+
12
+
13
+ class base_cfg:
14
+ def __init__(
15
+ self,
16
+ epoch: int,
17
+ datasets_set: int,
18
+ experiment_name: Optional[str] = None,
19
+ ):
20
+ self.experiment_name = experiment_name = (
21
+ self.__class__.__name__ if experiment_name is None else experiment_name
22
+ )
23
+ self.datasets_set = datasets_set
24
+
25
+ # Trainv3
26
+ self.devices: List[int] = [0, 1]
27
+ # How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after that fraction of each training epoch.
28
+ self.val_check_interval: float = 1.0
29
+
30
+ # Perform a validation loop after every N training epochs.
31
+ self.check_val_every_n_epoch: int = 2
32
+
33
+ self.precision = 16
34
+ self.transform1 = [A.HorizontalFlip(p=0.5)]
35
+
36
+ self.save_top_k = 2
37
+
38
+ # ConvNeXtAdapter
39
+ self.dec_kernel = 1 # decoder kernel size
40
+
41
+ # Version 1: as usual
42
+ # Version 2: mean, std
43
+ self.model_version = 1
44
+
45
+ self.visualized_num_dev_samples = 0
46
+
47
+ # PytorchLightning Trainer
48
+ self.sync_batchnorm = True
49
+
50
+ self.normalized_depth: bool = True
51
+
52
+ self.test_image_size: int = 224
53
+ self.image_size: int = 224
54
+
55
+ """Whether using fp16 instead of fp32 (default)"""
56
+ self.is_fp16: bool = True
57
+
58
+ self.is_padding: bool = (
59
+ False # deprecated due to randomly switching between padding and non-padding
60
+ )
61
+
62
+ # """For debug only"""
63
+ # self.max_train_samples: Optional[int] = None
64
+ # self.max_dev_samples: Optional[int] = None
65
+
66
+ """Whether using padding for test"""
67
+ self.is_padding_for_test: bool = False
68
+
69
+ """Seed"""
70
+ self.seed: int = 2022
71
+
72
+ """ MultiMAE """
73
+ self.decoder_depth: int = 4
74
+ self.encoder_depth: int = 12
75
+ self.is_inference_with_no_depth: bool = False
76
+ self.inputs = ["rgb", "depth"]
77
+ self.outputs = ["sod"]
78
+ self.decoder_main_tasks: List[List[str]] = [["rgb"]]
79
+ self.learnable_pos_emb: bool = False
80
+ self.learnable_additional_gt_tokens: bool = False
81
+ self.decoder_interpolate_mode: str = "bilinear" # ['bilinear', 'nearest']
82
+ self.dim_tokens: int = 768
83
+ self.act_fn = partial(nn.ReLU, inplace=True)
84
+ self.num_heads: int = 12
85
+ self.freeze_encoder: bool = False
86
+
87
+ """Data Augmentation"""
88
+ self.data_augmentation_version: int = 2
89
+ self.data_augmentation_config = DataAugmentationConfig()
90
+
91
+ self.ckpt_path: Optional[str] = None
92
+ self.description: str = "" # Override this
93
+ self.embed_dim: int = 6144
94
+
95
+ """Pretrained Backbone"""
96
+ self.pretrained_backbone: Optional[PRETRAINED_BACKBONE] = (
97
+ PRETRAINED_BACKBONE.MULTIMAE
98
+ )
99
+
100
+ """
101
+ Required only when self.pretrained_backbone in [PRETRAINED_BACKBONE.S_MULTIMAE, PRETRAINED_BACKBONE.LARGE_S_MULTIMAE].
102
+ Example: 'v1.0.4_e499' stands for version 1.0.4, epoch 499, trained 500 epochs
103
+ """
104
+ self.pretrained_backbone_version: Optional[str] = None
105
+
106
+ """Ground truth
107
+ V1: 1 head, each head has 1 class, BCE
108
+ V2: 1 head, each head has 5 classes, CE
109
+ V3: 5 heads, each head has 1 class, BCE
110
+ V4: 1 head, each head has 5 classes, BCE
111
+ V5: additional global token indicates individual thinker
112
+ """
113
+ self.ground_truth_version = 1
114
+ self.additional_gt_tokens_mlp_channels = []
115
+ self.num_classes = 1
116
+ self.actual_num_classes = 1
117
+
118
+ self.is_cache = False
119
+
120
+ """Learning rate
121
+ LR strategy:
122
+ V1: The ratio of unpretrained and pretrained is always 1:lr_scale
123
+ V2: The ratio of unpretrained and pretrained is changed gradually from 1:lr_scale -> 1:1
124
+ """
125
+ self.lr_strategy_version = 1
126
+ self.lr: float
127
+ self.end_lr: float = 1e-11
128
+ self.lr_scale: int
129
+ self.lr_power: float = 0.9
130
+
131
+ # Deprecated from v3
132
+ self.save_checkpoints_after_each_n_epochs: int = 10 # Not used in trainv3
133
+
134
+ self.weight_decay = 0.05
135
+ self.num_workers = 2
136
+ self.num_epochs_every_restart = 100
137
+
138
+ self.betas: Tuple[float, float] = (0.9, 0.999)
139
+
140
+ self.input_patch_size: int = 16
141
+ self.output_patch_size: int = 16 # must be a square number
142
+
143
+ """Warmup batchsize"""
144
+ self.warmup_min_batch_size: Optional[int] = None
145
+ self.warmup_epoch_batch_size: Optional[int] = None
146
+
147
+ self.batch_size: int
148
+ self.val_batch_size: int
149
+ self.test_batch_size: int = 100
150
+ self.nepochs: int
151
+
152
+ def todict(self):
153
+ d = dict()
154
+ for k, v in self.__dict__.items():
155
+ if not k.startswith("_"):
156
+ d[k] = v
157
+ return d
158
+
159
+ @property
160
+ def total_iters_per_epoch(self):
161
+ return math.ceil(
162
+ (self.num_training_samples_per_epoch)
163
+ / (self.batch_size * len(self.devices))
164
+ )
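As a worked example of the `total_iters_per_epoch` property above (numbers borrowed from `cfgv4_0_2006`, which trains with `batch_size=20` on a single device, and from the 8,025-sample COME8K training split; illustrative only):

```python
import math

num_training_samples_per_epoch = 8025  # COME8K training split
batch_size, num_devices = 20, 1        # cfgv4_0_2006: batch_size=20, devices=[0]
print(math.ceil(num_training_samples_per_epoch / (batch_size * num_devices)))  # 402
```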
s_multimae/configs/data_augmentation_config.py ADDED
@@ -0,0 +1,19 @@
1
+ class RandomGaussianBlurConfig:
2
+ def __init__(self, p=0.5, max_gaussian_kernel=19) -> None:
3
+ self.p = p
4
+ self.max_gaussian_kernel = max_gaussian_kernel
5
+
6
+
7
+ class DataAugmentationConfig:
8
+ def __init__(self) -> None:
9
+ self.mean_normalization = [0.5, 0.5, 0.5]
10
+ self.std_normalization = [0.5, 0.5, 0.5]
11
+ self.image_gaussian_config = RandomGaussianBlurConfig(
12
+ p=0.5,
13
+ max_gaussian_kernel=19,
14
+ )
15
+ self.depth_gaussian_config = RandomGaussianBlurConfig(
16
+ p=0.5,
17
+ max_gaussian_kernel=36,
18
+ )
19
+ self.random_horizontal_flip_prob = 0.5
s_multimae/configs/experiment_config.py ADDED
@@ -0,0 +1,31 @@
1
+ from functools import partial
2
+ from typing import Dict, Optional, Type
3
+
4
+ from .base_config import base_cfg
5
+ import importlib, inspect, os
6
+ from glob import glob
7
+
8
+ arg_cfg: Dict[str, Type[base_cfg]] = dict()
9
+
10
+ modules = []
11
+ for p in glob("s_multimae/configs/experiment_configs/*.py"):
12
+ if not os.path.basename(p).startswith("__"): # skip __init__.py and other dunder files
13
+ module_name = os.path.splitext(os.path.basename(p))[0]
14
+ modules.append(f"s_multimae.configs.experiment_configs.{module_name}")
15
+
16
+ for module in modules:
17
+ for name, cls in inspect.getmembers(
18
+ importlib.import_module(module), inspect.isclass
19
+ ):
20
+ if name.startswith("cfg"):
21
+ arg_cfg[name] = cls
22
+
23
+
24
+ def get_config_by_set_version(set_version: int) -> base_cfg:
25
+ if set_version not in [1, 2, 3, 4]:
26
+ raise Exception(f"Unsupported set version {set_version}")
27
+ return arg_cfg[f"cfg_set_{set_version}"]()
28
+
29
+
30
+ def get_config(cfg_name: str, epoch: Optional[int] = None) -> base_cfg:
31
+ return arg_cfg[cfg_name](epoch)
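A usage sketch for the config registry above (the class name is taken from `expv1_dynamic.py`; any class whose name starts with `cfg` is registered automatically):

```python
from s_multimae.configs.experiment_config import get_config

cfg = get_config("cfgv4_0_2006")  # ViT-L, multi-GT experiment
print(cfg.experiment_name, cfg.description, cfg.batch_size)
```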
s_multimae/configs/experiment_configs/__init__.py ADDED
File without changes
s_multimae/configs/experiment_configs/expv1_dynamic.py ADDED
@@ -0,0 +1,277 @@
1
+ import albumentations as A
2
+ import cv2
3
+
4
+ from typing import Optional
5
+ from definition import PRETRAINED_BACKBONE
6
+ from ..base_config import base_cfg
7
+
8
+
9
+ class cfgv4_0_2006(base_cfg):
10
+ def __init__(self, epoch: Optional[int] = None):
11
+ super().__init__(epoch, datasets_set=1)
12
+
13
+ self.check_val_every_n_epoch = 1
14
+ self.num_workers = 4
15
+ self.devices = [0]
16
+
17
+ self.description = "ViT Large [DoT]"
18
+
19
+ """MultiMAE"""
20
+ self.pretrained_backbone = PRETRAINED_BACKBONE.LARGE_S_MULTIMAE
21
+ self.pretrained_backbone_version = "v2.0.5-pr"
22
+
23
+ # Large MAE
24
+ self.dim_tokens = 1024
25
+ self.encoder_depth = 24
26
+ self.num_heads = 16
27
+
28
+ self.clip_grad = None
29
+ self.normalized_depth = False
30
+
31
+ """Decoders"""
32
+ self.decoder_main_tasks = [["rgb", "depth"]]
33
+ self.decoder_depth = 10
34
+
35
+ # ConvNeXtAdapter
36
+ self.dec_kernel = 3
37
+
38
+ # debug
39
+ # self.max_train_samples = 80
40
+ # self.max_dev_samples = 80
41
+
42
+ # Diversity of thought
43
+ self.ground_truth_version = 6
44
+ self.num_classes = 5 # ignored
45
+ self.actual_num_classes = 5 # ignored
46
+ self.additional_gt_tokens_mlp_channels = [768 * 2]
47
+
48
+ """Learning rate"""
49
+ self.lr = 1e-5
50
+ self.end_lr = 1e-8
51
+ self.lr_scale = 100
52
+
53
+ self.batch_size = 20
54
+ self.val_batch_size = 200
55
+ self.nepochs = 400
56
+ self.num_epochs_every_restart = 100
57
+
58
+ self.data_augmentation_version = 6
59
+ self.train_function_version = 3
60
+ self.weight_decay = 5e-2
61
+ self.transform1 = [
62
+ A.HorizontalFlip(p=0.5),
63
+ ]
64
+
65
+
66
+ class cfgv4_0_2007(base_cfg):
67
+ def __init__(self, epoch: Optional[int] = None):
68
+ super().__init__(epoch, datasets_set=1)
69
+
70
+ self.check_val_every_n_epoch = 1
71
+ self.num_workers = 4
72
+ self.devices = [0]
73
+
74
+ self.description = "ViT Base [DoT]"
75
+
76
+ """MultiMAE"""
77
+ self.pretrained_backbone = PRETRAINED_BACKBONE.S_MULTIMAE
78
+ self.pretrained_backbone_version = "v2.0.1-pr"
79
+
80
+ self.clip_grad = None
81
+ self.normalized_depth = False
82
+
83
+ """Decoders"""
84
+ self.decoder_main_tasks = [["rgb", "depth"]]
85
+ self.decoder_depth = 10
86
+
87
+ # ConvNeXtAdapter
88
+ self.dec_kernel = 3
89
+
90
+ # debug
91
+ # self.max_train_samples = 80
92
+ # self.max_dev_samples = 80
93
+
94
+ # Diversity of thought
95
+ self.ground_truth_version = 6
96
+ self.num_classes = 5 # ignored
97
+ self.actual_num_classes = 5 # ignored
98
+ self.additional_gt_tokens_mlp_channels = [768 * 2]
99
+
100
+ """Learning rate"""
101
+ self.lr = 1e-5
102
+ self.end_lr = 1e-8
103
+ self.lr_scale = 100
104
+
105
+ self.batch_size = 40
106
+ self.val_batch_size = 200
107
+ self.nepochs = 400
108
+ self.num_epochs_every_restart = 100
109
+
110
+ self.data_augmentation_version = 6
111
+ self.train_function_version = 3
112
+ self.weight_decay = 5e-2
113
+ self.transform1 = [
114
+ A.HorizontalFlip(p=0.5),
115
+ A.OneOf(
116
+ [
117
+ A.Compose(
118
+ [
119
+ A.RandomCropFromBorders(
120
+ crop_left=0.3,
121
+ crop_right=0.3,
122
+ crop_top=0.3,
123
+ crop_bottom=0.3,
124
+ p=0.2,
125
+ ),
126
+ A.ShiftScaleRotate(
127
+ shift_limit=0.0625,
128
+ scale_limit=0.1,
129
+ rotate_limit=45,
130
+ p=0.1,
131
+ ),
132
+ A.Perspective(
133
+ p=0.2,
134
+ scale=(0.05, 0.1),
135
+ ),
136
+ ]
137
+ ),
138
+ A.Compose(
139
+ [
140
+ A.RandomCropFromBorders(
141
+ crop_left=0.3,
142
+ crop_right=0.3,
143
+ crop_top=0.3,
144
+ crop_bottom=0.3,
145
+ p=0.2,
146
+ ),
147
+ A.ShiftScaleRotate(
148
+ shift_limit=0.0625,
149
+ scale_limit=0.1,
150
+ rotate_limit=45,
151
+ p=0.1,
152
+ border_mode=cv2.BORDER_CONSTANT,
153
+ value=(255, 255, 255),
154
+ mask_value=0,
155
+ ),
156
+ A.Perspective(
157
+ p=0.2,
158
+ scale=(0.05, 0.1),
159
+ pad_mode=cv2.BORDER_CONSTANT,
160
+ pad_val=(255, 255, 255),
161
+ mask_pad_val=0,
162
+ ),
163
+ ]
164
+ ),
165
+ ]
166
+ ),
167
+ ]
168
+
169
+
170
+ class cfgv4_0_2002(base_cfg):
171
+ def __init__(self, epoch: Optional[int] = None):
172
+ super().__init__(epoch, datasets_set=1)
173
+
174
+ self.check_val_every_n_epoch = 1
175
+ self.num_workers = 4
176
+ self.devices = [3]
177
+
178
+ # self.description = "Trainv3-DAv6-DiversityOfThought-NotMuchAug"
179
+ self.description = "DEBUG"
180
+
181
+ """MultiMAE"""
182
+ self.pretrained_backbone = PRETRAINED_BACKBONE.S_MULTIMAE
183
+ self.pretrained_backbone_version = "v2.0.1-pr"
184
+
185
+ # Large MAE
186
+ # self.dim_tokens = 1024
187
+ # self.encoder_depth = 24
188
+ # self.num_heads = 16
189
+
190
+ self.clip_grad = None
191
+ self.normalized_depth = False
192
+
193
+ """Decoders"""
194
+ self.decoder_main_tasks = [["rgb", "depth"]]
195
+ self.decoder_depth = 10
196
+
197
+ # ConvNeXtAdapter
198
+ self.dec_kernel = 3
199
+
200
+ # debug
201
+ self.max_train_samples = 20
202
+ self.max_dev_samples = 20
203
+
204
+ # Diversity of thought
205
+ self.ground_truth_version = 6
206
+ self.num_classes = 5 # ignored
207
+ self.actual_num_classes = 5 # ignored
208
+ self.additional_gt_tokens_mlp_channels = [768 * 2]
209
+
210
+ """Learning rate"""
211
+ self.lr = 1e-5
212
+ self.end_lr = 1e-8
213
+ self.lr_scale = 100
214
+
215
+ self.batch_size = 5
216
+ self.val_batch_size = 5
217
+ self.nepochs = 400
218
+ self.num_epochs_every_restart = 100
219
+
220
+ self.data_augmentation_version = 6
221
+ self.train_function_version = 3
222
+ self.weight_decay = 5e-2
223
+ self.transform1 = [
224
+ A.HorizontalFlip(p=0.5),
225
+ A.OneOf(
226
+ [
227
+ A.Compose(
228
+ [
229
+ A.RandomCropFromBorders(
230
+ crop_left=0.3,
231
+ crop_right=0.3,
232
+ crop_top=0.3,
233
+ crop_bottom=0.3,
234
+ p=0.2,
235
+ ),
236
+ A.ShiftScaleRotate(
237
+ shift_limit=0.0625,
238
+ scale_limit=0.1,
239
+ rotate_limit=45,
240
+ p=0.1,
241
+ ),
242
+ A.Perspective(
243
+ p=0.2,
244
+ scale=(0.05, 0.1),
245
+ ),
246
+ ]
247
+ ),
248
+ A.Compose(
249
+ [
250
+ A.RandomCropFromBorders(
251
+ crop_left=0.3,
252
+ crop_right=0.3,
253
+ crop_top=0.3,
254
+ crop_bottom=0.3,
255
+ p=0.2,
256
+ ),
257
+ A.ShiftScaleRotate(
258
+ shift_limit=0.0625,
259
+ scale_limit=0.1,
260
+ rotate_limit=45,
261
+ p=0.1,
262
+ border_mode=cv2.BORDER_CONSTANT,
263
+ value=(255, 255, 255),
264
+ mask_value=0,
265
+ ),
266
+ A.Perspective(
267
+ p=0.2,
268
+ scale=(0.05, 0.1),
269
+ pad_mode=cv2.BORDER_CONSTANT,
270
+ pad_val=(255, 255, 255),
271
+ mask_pad_val=0,
272
+ ),
273
+ ]
274
+ ),
275
+ ]
276
+ ),
277
+ ]
s_multimae/da/__init__.py ADDED
File without changes
s_multimae/da/base_da.py ADDED
@@ -0,0 +1,33 @@
1
+ import abc
2
+ from typing import List, Optional, Tuple
3
+ from torch import nn, Tensor
4
+ from PIL import Image
5
+
6
+
7
+ class BaseDataAugmentation(nn.Module):
8
+ def __init__(self):
9
+ super(BaseDataAugmentation, self).__init__()
10
+
11
+ @abc.abstractmethod
12
+ def forward(
13
+ self,
14
+ image: Image.Image,
15
+ depth: Image.Image,
16
+ gt: Optional[Image.Image] = None,
17
+ ranking_gt: Optional[Image.Image] = None,
18
+ multi_gts: Optional[List[Image.Image]] = None,
19
+ is_transform: bool = True, # is augmented?
20
+ is_debug: bool = False,
21
+ ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
22
+ """
23
+ Usual case:
24
+ If gt is provided, return [image, depth, gt]
25
+ Otherwise, return [image, depth]
26
+
27
+ When ranking_gt is provided, gt will be ignored
28
+ Return [image, depth, ranking_gt]
29
+
30
+ For debugging:
31
+ Return [image, depth, gt|ranking_gt, unnormalized, Optional[ranking_gts]]
32
+ """
33
+ pass
s_multimae/da/dav6.py ADDED
@@ -0,0 +1,147 @@
1
+ from typing import List, Optional, Tuple
2
+ from PIL import Image
3
+ import numpy as np
4
+ from torchvision import transforms
5
+ import albumentations as A
6
+ import torch
7
+ from torch import Tensor
8
+
9
+ from ..configs.base_config import base_cfg
10
+ from .base_da import BaseDataAugmentation
11
+
12
+
13
+ class DataAugmentationV6(BaseDataAugmentation):
14
+ def __init__(
15
+ self,
16
+ cfg: base_cfg,
17
+ is_padding=True,
18
+ ):
19
+ super().__init__()
20
+ self.image_size = cfg.image_size
21
+ self.is_padding = is_padding
22
+ self.cfg = cfg
23
+
+         self.to_tensor = transforms.ToTensor()
+
+         self.additional_targets = {
+             "depth": "image",
+             "gt": "mask",
+             "ranking_gt": "mask",
+             "multi_gts": "mask",
+         }
+
+         # For rgb+depth+gt
+         self.transform1 = A.Compose(
+             cfg.transform1,
+             additional_targets=self.additional_targets,
+         )
+
+         # For rgb only
+         self.transform2 = A.Compose(
+             [
+                 A.GaussianBlur(p=0.5, blur_limit=(3, 19)),
+                 A.RandomBrightnessContrast(p=0.5),
+                 A.ColorJitter(p=0.5),
+             ]
+         )
+
+         # For depth only
+         self.transform3 = A.Compose([A.GaussianBlur(p=0.5, blur_limit=(3, 37))])
+
+         # For rgb+depth+gt
+         self.transform4 = A.Compose(
+             [A.Resize(self.image_size, self.image_size)],
+             additional_targets=self.additional_targets,
+             is_check_shapes=False,
+         )
+
+         # For rgb only
+         self.transform5 = A.Compose([A.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
+
+         # For depth only
+         self.transform6 = A.Compose([A.Normalize(0.5, 0.5)])
+
+     def forward(
+         self,
+         image: Image.Image,
+         depth: Image.Image,
+         gt: Optional[Image.Image] = None,
+         ranking_gt: Optional[Image.Image] = None,
+         multi_gts: Optional[List[Image.Image]] = None,
+         is_transform: bool = True,  # whether to apply augmentation
+         is_debug: bool = False,
+     ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+         ## 1. Convert to numpy arrays: image, depth, gt, ranking_gts
+         image = np.array(image)
+         depth = np.array(depth)
+         d = dict(image=image, depth=depth)
+         if gt is not None:
+             gt = np.array(gt)
+             d["gt"] = gt
+
+         if not is_transform:
+             # Dev or Test
+             d = self.transform4(**d)
+             d["image"] = self.transform5(image=d["image"])["image"]
+             # d["depth"] = self.transform6(image=depth)["image"]
+             if gt is not None:
+                 return self.to_tensors([d["image"], d["depth"], d["gt"]])
+             else:
+                 return self.to_tensors([d["image"], d["depth"]])
+
+         d["depth"] = 255 - d["depth"]  # inverse depth
+
+         # if ranking_gt is not None and multi_gts is not None:
+         #     print('[WARN] Both ranking_gt and multi_gts are not none, but we prioritize multi_gts')
+
+         if ranking_gt is not None:
+             ranking_gt = np.array(ranking_gt)
+
+         if multi_gts is not None:
+             multi_gts = np.stack(multi_gts, axis=2)
+             d["multi_gts"] = multi_gts
+
+         ## 2. First transformation for image (Contrast, GaussianBlur, ...) and depth (GaussianBlur, ...)
+         d["image"] = self.transform2(image=d["image"])["image"]
+         d["depth"] = self.transform3(image=d["depth"])["image"]
+
+         ## 3. Transformations defined in config: change perspective, rotation, size, ...
+         d = self.transform1(**d)
+
+         ## 4. Resize
+         d = self.transform4(**d)
+
+         ## 5. Back up the image before normalizing it (debug only)
+         if is_debug:
+             unnormalized_image = d["image"]
+
+         ## 6. Construct response
+         d["depth"] = 255 - d["depth"]  # inverse depth
+         d["image"] = self.transform5(image=d["image"])["image"]
+         # d["depth"] = self.transform6(image=depth)["image"]
+         rs = self.to_tensors([d["image"], d["depth"]])
+         if multi_gts is not None:
+             rs += self.to_tensors([d["multi_gts"]])
+         elif ranking_gt is not None:
+             rs += [torch.from_numpy(d["ranking_gt"]).to(torch.long)]
+         else:
+             rs += self.to_tensors([d["gt"]])
+
+         ## 7. For debug only
+         if is_debug:
+             rs.append(unnormalized_image)
+
+         if ranking_gt is not None:
+             ranking_gts = []
+             for i in range(self.cfg.num_classes):
+                 ranking_gts.append(
+                     np.array(d["ranking_gt"] == i).astype(np.uint8) * 255
+                 )
+             rs.append(ranking_gts)
+         if multi_gts is not None:
+             rs.append(d["multi_gts"])
+
+         return rs
+
+     def to_tensors(self, lst: List[Tensor]) -> List[Tensor]:
+         return [self.to_tensor(e) for e in lst]
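
The pipeline in `DataAugmentationV6.forward` applies photometric jitter to the RGB image only, blurring to the depth map only, and then the shared geometric transforms from `cfg.transform1` jointly over RGB, depth and GT (via `additional_targets`) before resizing and normalizing. The standalone sketch below mirrors that ordering on dummy arrays; the horizontal flip is only a stand-in for `cfg.transform1`, which is defined in the experiment configs and not shown here.

```python
import numpy as np
import albumentations as A

# Dummy sample standing in for an RGB image, its depth map and a binary GT mask.
rgb = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
depth = np.random.randint(0, 256, (480, 640), dtype=np.uint8)
gt = np.random.randint(0, 2, (480, 640), dtype=np.uint8) * 255

additional_targets = {"depth": "image", "gt": "mask"}

# Photometric augmentation applied to RGB only (mirrors transform2).
rgb_only = A.Compose([
    A.GaussianBlur(p=0.5, blur_limit=(3, 19)),
    A.RandomBrightnessContrast(p=0.5),
    A.ColorJitter(p=0.5),
])
# Blur applied to depth only (mirrors transform3).
depth_only = A.Compose([A.GaussianBlur(p=0.5, blur_limit=(3, 37))])
# Shared geometric augmentation + resize, applied jointly so RGB, depth and GT stay aligned
# (mirrors transform1 followed by transform4).
shared = A.Compose(
    [A.HorizontalFlip(p=0.5), A.Resize(224, 224)],
    additional_targets=additional_targets,
)

depth = 255 - depth                      # inverse depth, as in forward()
rgb = rgb_only(image=rgb)["image"]
depth = depth_only(image=depth)["image"]
out = shared(image=rgb, depth=depth, gt=gt)
out["depth"] = 255 - out["depth"]        # invert back before normalization
print(out["image"].shape, out["depth"].shape, out["gt"].shape)
```
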
s_multimae/data_augmentation.py ADDED
@@ -0,0 +1,19 @@
+ from torch import nn
+
+ from .configs.base_config import base_cfg
+ from .da.dav6 import DataAugmentationV6
+ from .da.base_da import BaseDataAugmentation
+
+
+ def get_data_augmentation(
+     cfg: base_cfg,
+     image_size: int,
+     is_padding: bool,
+ ) -> BaseDataAugmentation:
+     if cfg.data_augmentation_version == 6:
+         print("Using DataAugmentationV6")
+         return DataAugmentationV6(cfg)
+     else:
+         raise NotImplementedError(
+             f"Unsupported DataAugmentation version {cfg.data_augmentation_version}"
+         )
s_multimae/model/__init__.py ADDED
File without changes
s_multimae/model/components.py ADDED
@@ -0,0 +1,117 @@
+ import torch
+ from torch import Tensor
+ import math
+ import warnings
+ from typing import Tuple, Union
+ from einops import rearrange
+
+ import os
+
+ from definition import PRETRAINED_BACKBONE
+ from ..configs.base_config import base_cfg
+
+
+ def pair(t: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
+     return t if isinstance(t, tuple) else (t, t)
+
+
+ def build_2d_sincos_posemb(h: int, w: int, embed_dim=1024, temperature=10000.0):
+     """Sine-cosine positional embeddings from MoCo-v3
+
+     Source: https://github.com/facebookresearch/moco-v3/blob/main/vits.py
+     """
+     grid_w = torch.arange(w, dtype=torch.float32)
+     grid_h = torch.arange(h, dtype=torch.float32)
+     grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="xy")
+     assert (
+         embed_dim % 4 == 0
+     ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
+     pos_dim = embed_dim // 4
+     omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+     omega = 1.0 / (temperature**omega)
+     out_w = torch.einsum("m,d->md", [grid_w.flatten(), omega])
+     out_h = torch.einsum("m,d->md", [grid_h.flatten(), omega])
+     pos_emb = torch.cat(
+         [torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1
+     )[None, :, :]
+     pos_emb = rearrange(pos_emb, "b (h w) d -> b d h w", h=h, w=w, d=embed_dim)
+     return pos_emb
+
+
+ def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, b: float):
+     # Cut & paste from PyTorch official master until it's in a few official releases - RW
+     # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+     def norm_cdf(x):
+         # Computes standard normal cumulative distribution function
+         return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+     if (mean < a - 2 * std) or (mean > b + 2 * std):
+         warnings.warn(
+             "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+             "The distribution of values may be incorrect.",
+             stacklevel=2,
+         )
+
+     with torch.no_grad():
+         # Values are generated by using a truncated uniform distribution and
+         # then using the inverse CDF for the normal distribution.
+         # Get upper and lower cdf values
+         l = norm_cdf((a - mean) / std)
+         u = norm_cdf((b - mean) / std)
+
+         # Uniformly fill tensor with values from [l, u], then translate to
+         # [2l-1, 2u-1].
+         tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+         # Use inverse cdf transform for normal distribution to get truncated
+         # standard normal
+         tensor.erfinv_()
+
+         # Transform to proper mean, std
+         tensor.mul_(std * math.sqrt(2.0))
+         tensor.add_(mean)
+
+         # Clamp to ensure it's in the proper range
+         tensor.clamp_(min=a, max=b)
+         return tensor
+
+
+ def trunc_normal_(tensor: Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+     # type: (Tensor, float, float, float, float) -> Tensor
+     r"""Fills the input Tensor with values drawn from a truncated
+     normal distribution. The values are effectively drawn from the
+     normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+     with values outside :math:`[a, b]` redrawn until they are within
+     the bounds. The method used for generating the random values works
+     best when :math:`a \leq \text{mean} \leq b`.
+     Args:
+         tensor: an n-dimensional `Tensor`
+         mean: the mean of the normal distribution
+         std: the standard deviation of the normal distribution
+         a: the minimum cutoff value
+         b: the maximum cutoff value
+     Examples:
+         >>> w = torch.empty(3, 5)
+         >>> nn.init.trunc_normal_(w)
+     """
+     return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+ def drop_path(x: Tensor, drop_prob: float = 0.0, training: bool = False):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+     This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+     the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+     See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+     changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+     'survival rate' as the argument.
+     """
+     if drop_prob == 0.0 or not training:
+         return x
+     keep_prob = 1 - drop_prob
+     shape = (x.shape[0],) + (1,) * (
+         x.ndim - 1
+     )  # work with diff dim tensors, not just 2D ConvNets
+     random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+     random_tensor.floor_()  # binarize
+     output = x.div(keep_prob) * random_tensor
+     return output
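
A quick sanity check of the two helpers above (assuming the repository root is on `PYTHONPATH`, since `components.py` imports from `definition.py`): the sin-cos table comes back as a `(1, embed_dim, h, w)` grid that the input adapters later flatten into `(1, h*w, embed_dim)` tokens, and `drop_path` zeroes whole samples while rescaling the survivors by `1 / keep_prob`.

```python
import torch
from s_multimae.model.components import build_2d_sincos_posemb, drop_path

# A 224x224 image with 16x16 patches gives a 14x14 grid of tokens.
pos_emb = build_2d_sincos_posemb(h=14, w=14, embed_dim=768)
print(pos_emb.shape)  # torch.Size([1, 768, 14, 14])

# Stochastic depth: with drop_prob=0.5 roughly half of the samples in a batch
# have this residual branch zeroed out, the rest are rescaled by 1/keep_prob.
x = torch.ones(8, 196, 768)
y = drop_path(x, drop_prob=0.5, training=True)
print(y[:, 0, 0])  # each entry is either 0.0 or 2.0
```
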
s_multimae/model/multimae.py ADDED
@@ -0,0 +1,938 @@
1
+ import math
2
+ import re
3
+ from collections import OrderedDict
4
+ from functools import partial
5
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torchvision.ops import MLP
9
+ from einops import rearrange, repeat
10
+ from torch import Tensor, nn
11
+
12
+ from definition import PRETRAINED_BACKBONE
13
+ from ..configs.base_config import base_cfg
14
+ from ..utils import count_parameters
15
+ from .components import (
16
+ build_2d_sincos_posemb,
17
+ drop_path,
18
+ pair,
19
+ trunc_normal_,
20
+ )
21
+
22
+
23
+ class PatchedInputAdapter(nn.Module):
24
+ """Adapter for spatial inputs, like images or feature maps.
25
+ Creates tokens from patches over the image.
26
+
27
+ :param num_channels: Number of input channels of the image/feature map
28
+ :param stride_level: Stride level compared to the full-sized image.
29
+ E.g. 4 for 1/4th the size of the image.
30
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
31
+ Patch size for smaller inputs will be computed accordingly.
32
+ :param dim_tokens: Dimension of output tokens. Can be set using init method.
33
+ :param sincos_pos_emb: Set to True (default) to use fixed 2D sin-cos positional embeddings
34
+ :param learnable_pos_emb: Set to True to learn positional embeddings instead
35
+ :param image_size: Default image size. Used to initialize size of positional embeddings.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ num_channels: int,
41
+ stride_level: int,
42
+ patch_size_full: Union[int, Tuple[int, int]],
43
+ dim_tokens: Optional[int] = None,
44
+ sincos_pos_emb: bool = True,
45
+ learnable_pos_emb: bool = False,
46
+ image_size: Union[int, Tuple[int]] = 224,
47
+ ):
48
+ super().__init__()
49
+ self.num_channels = num_channels
50
+ self.stride_level = stride_level
51
+ self.patch_size_full = pair(patch_size_full)
52
+ self.dim_tokens = dim_tokens
53
+ self.sincos_pos_emb = sincos_pos_emb
54
+ self.learnable_pos_emb = learnable_pos_emb
55
+ self.image_size = pair(image_size)
56
+ self.num_patches = (self.image_size[0] // patch_size_full) * (
57
+ self.image_size[1] // patch_size_full
58
+ )
59
+
60
+ # Actual patch height and width, taking into account stride of input
61
+ self.P_H = max(1, self.patch_size_full[0] // stride_level)
62
+ self.P_W = max(1, self.patch_size_full[1] // stride_level)
63
+
64
+ if self.dim_tokens is not None:
65
+ self.init(dim_tokens=dim_tokens)
66
+
67
+ def init(self, dim_tokens: int = 768):
68
+ """
69
+ Initialize parts of encoder that are dependent on dimension of tokens.
70
+ Should be called when setting up MultiMAE.
71
+
72
+ :param dim_tokens: Dimension of tokens
73
+ """
74
+ self.dim_tokens = dim_tokens
75
+
76
+ # Task embedding identifying from which task a given token comes from
77
+ # Fixed-size positional embeddings. Can be interpolated to different input sizes
78
+ h_posemb = self.image_size[0] // (self.stride_level * self.P_H)
79
+ w_posemb = self.image_size[1] // (self.stride_level * self.P_W)
80
+ if self.sincos_pos_emb:
81
+ self.pos_emb = build_2d_sincos_posemb(
82
+ h=h_posemb, w=w_posemb, embed_dim=self.dim_tokens
83
+ )
84
+ self.pos_emb = nn.Parameter(
85
+ self.pos_emb, requires_grad=self.learnable_pos_emb
86
+ )
87
+ else:
88
+ self.pos_emb = nn.Parameter(
89
+ torch.zeros(1, self.dim_tokens, h_posemb, w_posemb)
90
+ )
91
+ trunc_normal_(self.pos_emb, std=0.02)
92
+
93
+ # Image -> tokens projection
94
+ self.proj = nn.Conv2d(
95
+ in_channels=self.num_channels,
96
+ out_channels=self.dim_tokens,
97
+ kernel_size=(self.P_H, self.P_W),
98
+ stride=(self.P_H, self.P_W),
99
+ )
100
+
101
+ @torch.jit.ignore
102
+ def no_weight_decay(self):
103
+ return {"pos_emb"}
104
+
105
+ def forward(self, x: Tensor) -> Tensor:
106
+ """
107
+ Forward pass through input adapter, transforming image to sequence of tokens.
108
+ Adds task and positional encodings.
109
+
110
+ :param x: Input image tensor
111
+ """
112
+ B, C, H, W = x.shape
113
+ assert (
114
+ self.dim_tokens is not None
115
+ ), "Need to call init(dim_tokens) function first"
116
+ assert (H % self.P_H == 0) and (
117
+ W % self.P_W == 0
118
+ ), f"Image sizes {H}x{W} must be divisible by patch sizes {self.P_H}x{self.P_W}"
119
+ N_H, N_W = H // self.P_H, W // self.P_W # Number of patches in height and width
120
+
121
+ # Create patches [B, C, H, W] -> [B, (H*W), C]
122
+ projected_x = self.proj(x)
123
+ x_patch = rearrange(projected_x, "b d nh nw -> b (nh nw) d")
124
+
125
+ # Create positional embedding
126
+ x_pos_emb = F.interpolate(
127
+ self.pos_emb, size=(N_H, N_W), mode="bicubic", align_corners=False
128
+ )
129
+ x_pos_emb = rearrange(x_pos_emb, "b d nh nw -> b (nh nw) d")
130
+
131
+ # Add patches and positional embeddings
132
+ x = x_patch + x_pos_emb
133
+
134
+ return x
135
+
136
+
137
+ class DropPath(nn.Module):
138
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
139
+
140
+ def __init__(self, drop_prob=None):
141
+ super(DropPath, self).__init__()
142
+ self.drop_prob = drop_prob
143
+
144
+ def forward(self, x: Tensor) -> Tensor:
145
+ return drop_path(x, self.drop_prob, self.training)
146
+
147
+ def extra_repr(self) -> str:
148
+ return "p={}".format(self.drop_prob)
149
+
150
+
151
+ class ConvNeXtBlock(nn.Module):
152
+ r"""ConvNeXt Block. There are two equivalent implementations:
153
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
154
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
155
+ We use (2) as we find it slightly faster in PyTorch
156
+
157
+ Args:
158
+ dim (int): Number of input channels.
159
+ drop_path: Stochastic depth rate. Default: 0.0
160
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 0 (disabled for isotropic ConvNeXt).
161
+
162
+ Code from: https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
163
+ """
164
+
165
+ def __init__(self, dim, drop_path=0.0, layer_scale_init_value=0.0):
166
+ super().__init__()
167
+ self.dwconv = nn.Conv2d(
168
+ dim, dim, kernel_size=7, padding=3, groups=dim
169
+ ) # depthwise conv
170
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
171
+ self.pwconv1 = nn.Linear(
172
+ dim, 4 * dim
173
+ ) # pointwise/1x1 convs, implemented with linear layers
174
+ self.act = nn.GELU()
175
+ self.pwconv2 = nn.Linear(4 * dim, dim)
176
+ self.gamma = (
177
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
178
+ if layer_scale_init_value > 0
179
+ else None
180
+ )
181
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
182
+
183
+ def forward(self, x: Tensor) -> Tensor:
184
+ input = x
185
+ x = self.dwconv(x)
186
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
187
+ x = self.norm(x)
188
+ x = self.pwconv1(x)
189
+ x = self.act(x)
190
+ x = self.pwconv2(x)
191
+ if self.gamma is not None:
192
+ x = self.gamma * x
193
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
194
+
195
+ x = input + self.drop_path(x)
196
+ return x
197
+
198
+
199
+ class ConvNeXtAdapter(nn.Module):
200
+ """Output adapter with ConvNext blocks for semantic segmentation
201
+
202
+ :param num_classes: Number of classes
203
+ :param num_heads: Number of attention heads
204
+ :param embed_dim: Token dimension after projection, and before reshaping operation.
205
+ :param preds_per_patch: Increases size of feature map by reshaping each patch Each patch gets reshaped
206
+ from embed_dim x 1 x 1 to (embed_dim / preds_per_patch) x (preds_per_patch ** 0.5) x (preds_per_patch ** 0.5)
207
+ :param main_tasks: Tasks to use for the adapter. Only tokens coming from these tasks are kept.
208
+ :param patch_size: Size of patches
209
+ :param depth: Number of ConvNeXt blocks
210
+ :interpolate_mode: Interpolation mode for final upsampling
211
+ """
212
+
213
+ def __init__(
214
+ self,
215
+ image_size: int,
216
+ num_classes: int,
217
+ embed_dim: int = 6144,
218
+ preds_per_patch: int = 16,
219
+ main_tasks: Iterable[str] = ("rgb",),
220
+ patch_size: int = 16,
221
+ depth: int = 4,
222
+ interpolate_mode: str = "bilinear",
223
+ act_fn: nn.Module = nn.GELU,
224
+ dec_kernel: int = 1,
225
+ ):
226
+ super().__init__()
227
+ self.main_tasks = main_tasks
228
+ self.patch_size = patch_size
229
+ self.embed_dim = embed_dim
230
+ self.preds_per_patch = preds_per_patch
231
+ self.class_dim = embed_dim // preds_per_patch
232
+ self.num_classes = num_classes
233
+ self.interpolate_mode = interpolate_mode
234
+ self.image_size = image_size
235
+
236
+ self.blocks = nn.Sequential(
237
+ *[ConvNeXtBlock(dim=self.class_dim) for _ in range(depth)]
238
+ )
239
+ if dec_kernel == 1:
240
+ self.final_layer_1 = nn.Sequential(
241
+ nn.Conv2d(self.class_dim, self.class_dim // 4, 1),
242
+ nn.BatchNorm2d(self.class_dim // 4),
243
+ act_fn(),
244
+ nn.Upsample(scale_factor=2, mode=self.interpolate_mode),
245
+ )
246
+
247
+ self.final_layer_2 = nn.Sequential(
248
+ nn.Conv2d(self.class_dim // 4, self.class_dim // 16, 1),
249
+ nn.BatchNorm2d(self.class_dim // 16),
250
+ act_fn(),
251
+ nn.Upsample(size=image_size, mode=self.interpolate_mode),
252
+ )
253
+
254
+ self.final_layer = nn.Conv2d(self.class_dim // 16, self.num_classes, 1)
255
+ elif dec_kernel == 3:
256
+ self.final_layer_1 = nn.Sequential(
257
+ nn.Conv2d(
258
+ self.class_dim,
259
+ self.class_dim // 4,
260
+ kernel_size=3,
261
+ stride=1,
262
+ padding=1,
263
+ ),
264
+ nn.BatchNorm2d(self.class_dim // 4),
265
+ act_fn(),
266
+ nn.Upsample(scale_factor=2, mode=self.interpolate_mode),
267
+ )
268
+
269
+ self.final_layer_2 = nn.Sequential(
270
+ nn.Conv2d(
271
+ self.class_dim // 4,
272
+ self.class_dim // 16,
273
+ kernel_size=3,
274
+ stride=1,
275
+ padding=1,
276
+ ),
277
+ nn.BatchNorm2d(self.class_dim // 16),
278
+ act_fn(),
279
+ nn.Upsample(size=image_size, mode=self.interpolate_mode),
280
+ )
281
+
282
+ self.final_layer = nn.Conv2d(
283
+ self.class_dim // 16,
284
+ self.num_classes,
285
+ kernel_size=3,
286
+ stride=1,
287
+ padding=1,
288
+ )
289
+ else:
290
+ raise Exception(f"Unsupported dec_kernel {dec_kernel}")
291
+
292
+ self.apply(self._init_weights)
293
+
294
+ def init(self, dim_tokens_enc: int = 768):
295
+ """
296
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
297
+ Should be called when setting up MultiMAE.
298
+
299
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
300
+ """
301
+ self.in_channels = dim_tokens_enc * len(self.main_tasks)
302
+
303
+ # Projection of encoder tokens to the patch dimension
304
+ self.proj_dec = nn.Linear(self.in_channels, self.embed_dim)
305
+ self._init_weights(self.proj_dec)
306
+
307
+ def _init_weights(self, m: nn.Module):
308
+ if isinstance(m, nn.Linear):
309
+ trunc_normal_(m.weight, std=0.02)
310
+ if isinstance(m, nn.Linear) and m.bias is not None:
311
+ nn.init.constant_(m.bias, 0)
312
+ elif isinstance(m, nn.LayerNorm):
313
+ nn.init.constant_(m.bias, 0)
314
+ nn.init.constant_(m.weight, 1.0)
315
+
316
+ def adapt_tokens(self, encoder_tokens: Tensor, input_info: Dict):
317
+ # Adapt tokens
318
+ x = []
319
+ for task in self.main_tasks:
320
+ start_idx = input_info["tasks"][task]["start_idx"]
321
+ end_idx = input_info["tasks"][task]["end_idx"]
322
+ x.append(encoder_tokens[:, start_idx:end_idx])
323
+
324
+ x = torch.cat(x, dim=-1)
325
+ return x
326
+
327
+ def forward(self, encoder_tokens: Tensor, input_info: Dict) -> Tensor:
328
+ H, W = input_info["image_size"]
329
+ N_H, N_W = H // self.patch_size, W // self.patch_size
330
+
331
+ x = self.adapt_tokens(encoder_tokens, input_info)
332
+
333
+ x = self.proj_dec(x)
334
+ x = rearrange(
335
+ x,
336
+ "b n (p c) -> b (n p) c",
337
+ n=N_H * N_W,
338
+ p=self.preds_per_patch,
339
+ c=self.class_dim,
340
+ )
341
+ x = rearrange(
342
+ x,
343
+ "b (nh nw ph pw) c -> b c (nh ph) (nw pw)",
344
+ nh=N_H,
345
+ nw=N_W,
346
+ ph=int(self.preds_per_patch**0.5),
347
+ pw=int(self.preds_per_patch**0.5),
348
+ )
349
+
350
+ x = self.blocks(x)
351
+
352
+ # for block in self.blocks:
353
+ # x = block(x)
354
+ # print(x.shape)
355
+
356
+ # print(x.shape)
357
+ x = self.final_layer_1(x)
358
+ # print(x.shape)
359
+ x = self.final_layer_2(x)
360
+ # print(x.shape)
361
+ x = self.final_layer(x)
362
+ # print(x.shape)
363
+
364
+ # Interpolate to sod res
365
+ # x = F.interpolate(x, size=(H, W), mode=self.interpolate_mode)
366
+
367
+ return x
368
+
369
+
370
+ class Attention(nn.Module):
371
+ def __init__(
372
+ self,
373
+ dim: int,
374
+ num_heads=8,
375
+ qkv_bias=False,
376
+ attn_drop=0.0,
377
+ proj_drop=0.0,
378
+ ):
379
+ super().__init__()
380
+ self.num_heads = num_heads
381
+ head_dim = dim // num_heads
382
+ self.scale = head_dim**-0.5
383
+
384
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
385
+ self.attn_drop = nn.Dropout(attn_drop)
386
+ self.proj = nn.Linear(dim, dim)
387
+ self.proj_drop = nn.Dropout(proj_drop)
388
+
389
+ def forward(self, x: Tensor) -> Tensor:
390
+ B, N, C = x.shape
391
+ qkv = (
392
+ self.qkv(x)
393
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
394
+ .permute(2, 0, 3, 1, 4)
395
+ )
396
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
397
+
398
+ attn = (q @ k.transpose(-2, -1)) * self.scale
399
+ attn = attn.softmax(dim=-1)
400
+ attn = self.attn_drop(attn)
401
+
402
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
403
+ x = self.proj(x)
404
+ x = self.proj_drop(x)
405
+ return x
406
+
407
+
408
+ class Mlp(nn.Module):
409
+ def __init__(
410
+ self,
411
+ in_features: int,
412
+ hidden_features: Optional[int] = None,
413
+ out_features: Optional[int] = None,
414
+ act_layer: nn.Module = nn.GELU,
415
+ drop: float = 0.0,
416
+ ):
417
+ super().__init__()
418
+ out_features = out_features or in_features
419
+ hidden_features = hidden_features or in_features
420
+ self.fc1 = nn.Linear(in_features, hidden_features)
421
+ self.act = act_layer()
422
+ self.fc2 = nn.Linear(hidden_features, out_features)
423
+ self.drop = nn.Dropout(drop)
424
+
425
+ def forward(self, x: Tensor) -> Tensor:
426
+ x = self.fc1(x)
427
+ x = self.act(x)
428
+ # x = self.drop(x)
429
+ # commit this for the orignal BERT implement
430
+ x = self.fc2(x)
431
+ x = self.drop(x)
432
+ return x
433
+
434
+
435
+ class Block(nn.Module):
436
+ def __init__(
437
+ self,
438
+ dim: int,
439
+ num_heads: int,
440
+ mlp_ratio=4.0,
441
+ qkv_bias=False,
442
+ drop=0.0,
443
+ attn_drop=0.0,
444
+ drop_path=0.0,
445
+ act_layer=nn.GELU,
446
+ norm_layer=nn.LayerNorm,
447
+ ):
448
+ super().__init__()
449
+ self.norm1 = norm_layer(dim)
450
+ self.attn = Attention(
451
+ dim,
452
+ num_heads=num_heads,
453
+ qkv_bias=qkv_bias,
454
+ attn_drop=attn_drop,
455
+ proj_drop=drop,
456
+ )
457
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
458
+ self.norm2 = norm_layer(dim)
459
+ mlp_hidden_dim = int(dim * mlp_ratio)
460
+ self.mlp = Mlp(
461
+ in_features=dim,
462
+ hidden_features=mlp_hidden_dim,
463
+ act_layer=act_layer,
464
+ drop=drop,
465
+ )
466
+
467
+ def forward(self, x: Tensor) -> Tensor:
468
+ x = x + self.drop_path(self.attn(self.norm1(x)))
469
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
470
+ return x
471
+
472
+
473
+ class MultiMAE(nn.Module):
474
+ """MultiMAE: Multi-task Multi-modal Masked Autoencoder
475
+ This module performs masking in its forward pass.
476
+ The MultiViT module defined below inherits from this module and performs a regular forward pass,
477
+ and should be used instead for downstream tasks
478
+
479
+
480
+ :param input_adapters: Dictionary of task -> input adapters
481
+ :param output_adapters: Optional dictionary of task -> output adapters
482
+
483
+ :param num_global_tokens: Number of additional global tokens to add (like cls tokens), default is 1
484
+ :param dim_tokens: Dimension of encoder tokens
485
+ :param depth: Depth of encoder
486
+ :param num_heads: Number of attention heads
487
+ :param mlp_ratio: MLP hidden dim ratio
488
+ :param qkv_bias: Set to False to disable bias
489
+ :param drop_rate: Dropout after MLPs and Attention
490
+ :param attn_drop_rate: Attention matrix drop rate
491
+ :param drop_path_rate: DropPath drop rate
492
+ :param norm_layer: Type of normalization layer
493
+ """
494
+
495
+ def __init__(
496
+ self,
497
+ input_adapters: Dict[str, PatchedInputAdapter],
498
+ output_adapters: Dict[str, ConvNeXtAdapter],
499
+ num_global_tokens: int = 1,
500
+ dim_tokens: int = 768,
501
+ depth: int = 12,
502
+ num_heads: int = 12,
503
+ mlp_ratio: float = 4.0,
504
+ qkv_bias: bool = True,
505
+ drop_rate: float = 0.0,
506
+ attn_drop_rate: float = 0.0,
507
+ drop_path_rate: float = 0.0,
508
+ norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
509
+ freeze_encoder: bool = False,
510
+ num_additional_gt_tokens: int = 0, # @deprecated
511
+ actual_num_additional_gt_tokens: int = 0, # @deprecated
512
+ learnable_additional_gt_tokens: bool = False,
513
+ additional_gt_tokens_mlp_channels: List[int] = [],
514
+ ground_truth_version: int = -1,
515
+ A: float = 0.5,
516
+ ):
517
+ super().__init__()
518
+ self.dim_tokens = dim_tokens
519
+ self.ground_truth_version = ground_truth_version
520
+ # Initialize input and output adapters
521
+ for adapter in input_adapters.values():
522
+ adapter.init(dim_tokens=dim_tokens)
523
+ self.input_adapters = nn.ModuleDict(input_adapters)
524
+ for adapter in output_adapters.values():
525
+ adapter.init(dim_tokens_enc=dim_tokens)
526
+ self.output_adapters = nn.ModuleDict(output_adapters)
527
+
528
+ # Additional learnable tokens that can be used by encoder to process/store global information
529
+ self.num_global_tokens = num_global_tokens
530
+ self.global_tokens = nn.Parameter(torch.zeros(1, num_global_tokens, dim_tokens))
531
+ trunc_normal_(self.global_tokens, std=0.02)
532
+
533
+ self.num_additional_gt_tokens = num_additional_gt_tokens # @deprecated
534
+ self.actual_num_additional_gt_tokens = (
535
+ actual_num_additional_gt_tokens # @deprecated
536
+ )
537
+ self.A = A
538
+ self.additional_gt_tokens_mlp_channels = additional_gt_tokens_mlp_channels
539
+ self.learnable_additional_gt_tokens = learnable_additional_gt_tokens
540
+ self.init_gt_tokens()
541
+
542
+ # Transformer encoder
543
+ dpr = [
544
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
545
+ ] # stochastic depth decay rule
546
+ self.encoder = nn.Sequential(
547
+ *[
548
+ Block(
549
+ dim=dim_tokens,
550
+ num_heads=num_heads,
551
+ mlp_ratio=mlp_ratio,
552
+ qkv_bias=qkv_bias,
553
+ drop=drop_rate,
554
+ attn_drop=attn_drop_rate,
555
+ drop_path=dpr[i],
556
+ norm_layer=norm_layer,
557
+ )
558
+ for i in range(depth)
559
+ ]
560
+ )
561
+
562
+ print(f"Encoder {count_parameters(self.encoder)}")
563
+
564
+ if freeze_encoder:
565
+ print("Freeze encoder")
566
+ for param in self.encoder.parameters():
567
+ param.requires_grad = False
568
+
569
+ self.apply(self._init_weights)
570
+ for name, m in self.named_modules():
571
+ if isinstance(m, nn.Linear):
572
+ if "qkv" in name:
573
+ # treat the weights of Q, K, V separately
574
+ val = math.sqrt(
575
+ 6.0 / float(m.weight.shape[0] // 3 + m.weight.shape[1])
576
+ )
577
+ nn.init.uniform_(m.weight, -val, val)
578
+ elif "kv" in name:
579
+ # treat the weights of K, V separately
580
+ val = math.sqrt(
581
+ 6.0 / float(m.weight.shape[0] // 2 + m.weight.shape[1])
582
+ )
583
+ nn.init.uniform_(m.weight, -val, val)
584
+
585
+ if isinstance(m, nn.Conv2d):
586
+ if ".proj" in name:
587
+ # From MAE, initialize projection like nn.Linear (instead of nn.Conv2d)
588
+ w = m.weight.data
589
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
590
+
591
+ print(f"Total params: {count_parameters(self)}")
592
+
593
+ def init_gt_tokens(self):
594
+ """Just prepare beforehand to save time in training
595
+ In inference, there is no need"""
596
+ addtional_gt_tokens: List[Tensor] = []
597
+ if self.num_additional_gt_tokens == 0:
598
+ self.token_mlp = nn.Identity()
599
+ return
600
+ if len(self.additional_gt_tokens_mlp_channels) > 0:
601
+ self.token_mlp = MLP(
602
+ self.dim_tokens,
603
+ self.additional_gt_tokens_mlp_channels + [self.dim_tokens],
604
+ )
605
+ else:
606
+ self.token_mlp = nn.Identity()
607
+
608
+ if self.ground_truth_version != 6:
609
+ T = 1 / (self.num_additional_gt_tokens * 4)
610
+ for i in range(self.actual_num_additional_gt_tokens):
611
+ t = [
612
+ 2 * math.pi * (offset / self.dim_tokens - i * T)
613
+ for offset in range(self.dim_tokens)
614
+ ]
615
+ addtional_gt_tokens.append(
616
+ nn.Parameter(
617
+ self.A * torch.cos(Tensor(t).unsqueeze(0).unsqueeze(0)),
618
+ requires_grad=self.learnable_additional_gt_tokens,
619
+ )
620
+ )
621
+ self.addtional_gt_tokens = nn.ParameterList(addtional_gt_tokens)
622
+
623
+ def _init_weights(self, m: nn.Module) -> None:
624
+ if isinstance(m, nn.Linear):
625
+ nn.init.xavier_uniform_(m.weight)
626
+ if isinstance(m, nn.Linear) and m.bias is not None:
627
+ nn.init.constant_(m.bias, 0)
628
+ elif isinstance(m, nn.LayerNorm):
629
+ nn.init.constant_(m.bias, 0)
630
+ nn.init.constant_(m.weight, 1.0)
631
+
632
+ @torch.jit.ignore
633
+ def no_weight_decay(self):
634
+ no_wd_set = {"global_tokens"}
635
+
636
+ for task, adapter in self.input_adapters.items():
637
+ if hasattr(adapter, "no_weight_decay"):
638
+ to_skip = adapter.no_weight_decay()
639
+ to_skip = set([f"input_adapters.{task}.{name}" for name in to_skip])
640
+ no_wd_set = no_wd_set | to_skip
641
+
642
+ for task, adapter in self.output_adapters.items():
643
+ if hasattr(adapter, "no_weight_decay"):
644
+ to_skip = adapter.no_weight_decay()
645
+ to_skip = set([f"output_adapters.{task}.{name}" for name in to_skip])
646
+ no_wd_set = no_wd_set | to_skip
647
+
648
+ return no_wd_set
649
+
650
+ def generate_input_info(
651
+ self, input_task_tokens: Dict[str, Tensor], image_size: Tuple[int, int]
652
+ ) -> Dict[str, Tensor]:
653
+ input_info = OrderedDict()
654
+ i = 0
655
+ input_info["tasks"] = {}
656
+ for domain, tensor in input_task_tokens.items():
657
+ num_tokens: Union[int, Tensor] = tensor.shape[1]
658
+
659
+ if type(num_tokens) == Tensor:
660
+ num_tokens = num_tokens.item()
661
+
662
+ d = {
663
+ "num_tokens": num_tokens,
664
+ "has_2d_posemb": True,
665
+ "start_idx": i,
666
+ "end_idx": i + num_tokens,
667
+ }
668
+ i += num_tokens
669
+ input_info["tasks"][domain] = d
670
+
671
+ input_info["image_size"] = image_size
672
+ input_info["num_task_tokens"] = i
673
+ input_info["num_global_tokens"] = self.num_global_tokens
674
+
675
+ return input_info
676
+
677
+
678
+ class MultiViT(MultiMAE):
679
+ def extract_B_H_W(self, x: Dict[str, Tensor]) -> Tuple[int, int, int]:
680
+ # If input x is a Tensor, assume it's RGB
681
+ # x = {'rgb': x} if isinstance(x, Tensor) else x
682
+ # Need image size for tokens->image reconstruction
683
+ if "rgb" in x:
684
+ B, _, H, W = x["rgb"].shape
685
+ elif "sod" in x:
686
+ B, H, W = x["sod"].shape
687
+ H *= self.input_adapters["sod"].stride_level
688
+ W *= self.input_adapters["sod"].stride_level
689
+ else:
690
+ B, _, H, W = list(x.values())[0].shape
691
+ return B, H, W
692
+
693
+ def process_input(
694
+ self,
695
+ x: Dict[str, Tensor],
696
+ gt_index_lst: List[int],
697
+ num_gts_lst: List[int],
698
+ ) -> Tuple[Tensor, Dict[str, Tensor]]:
699
+ """
700
+ len(gt_i) must equal to x.shape[0] when self.num_additional_gt_tokens > 0
701
+ """
702
+ B, H, W = self.extract_B_H_W(x)
703
+
704
+ # Encode selected inputs to tokens
705
+ input_task_tokens: Dict[str, Tensor] = {
706
+ domain: self.input_adapters[domain](tensor)
707
+ for domain, tensor in x.items()
708
+ if domain in self.input_adapters
709
+ }
710
+
711
+ input_info = self.generate_input_info(
712
+ input_task_tokens=input_task_tokens, image_size=(H, W)
713
+ )
714
+ input_tokens = torch.cat(
715
+ [task_tokens for task_tokens in input_task_tokens.values()], dim=1
716
+ )
717
+
718
+ # Add global tokens to input tokens
719
+ global_tokens = repeat(self.global_tokens, "() n d -> b n d", b=B)
720
+
721
+ if self.ground_truth_version == 6:
722
+ # We need two inputs: gt_index, num_gts
723
+ assert len(gt_index_lst) == len(num_gts_lst)
724
+ additional_gt_tokens = []
725
+ for gt_index, num_gts in zip(gt_index_lst, num_gts_lst):
726
+ T = 1 / num_gts
727
+ i = gt_index
728
+ t = [
729
+ 2 * math.pi * (offset / self.dim_tokens - i * T)
730
+ for offset in range(self.dim_tokens)
731
+ ]
732
+ additional_gt_token = self.A * torch.cos(
733
+ Tensor(t).unsqueeze(0).unsqueeze(0)
734
+ )
735
+ additional_gt_tokens.append(additional_gt_token)
736
+ additional_gt_tokens = torch.cat(additional_gt_tokens, dim=0).to(
737
+ input_tokens.device
738
+ )
739
+ additional_gt_tokens = self.token_mlp(additional_gt_tokens)
740
+ input_tokens = torch.cat(
741
+ [input_tokens, global_tokens, additional_gt_tokens], dim=1
742
+ )
743
+ else:
744
+ if self.num_additional_gt_tokens > 0:
745
+
746
+ assert gt_index_lst is not None and len(gt_index_lst) == B
747
+ additional_gt_tokens: Tensor = torch.cat(
748
+ [self.addtional_gt_tokens[gt_i] for gt_i in gt_index_lst], dim=0
749
+ )
750
+ additional_gt_tokens = self.token_mlp(additional_gt_tokens)
751
+ input_tokens = torch.cat(
752
+ [input_tokens, global_tokens, additional_gt_tokens], dim=1
753
+ )
754
+ else:
755
+ input_tokens = torch.cat([input_tokens, global_tokens], dim=1)
756
+
757
+ return input_tokens, input_info
758
+
759
+ def forward(
760
+ self,
761
+ x: Dict[str, Tensor],
762
+ gt_index_lst: Optional[List[int]] = None,
763
+ max_gts_lst: Optional[List[int]] = None,
764
+ ) -> Dict[str, Tensor]:
765
+ """
766
+ Forward pass through input adapters, transformer encoder and output adapters.
767
+
768
+ :param x: Dictionary of tensors
769
+ :param outputs: List of outputs. For ex: outputs=['sod', 'depth']. Make sure 'sod' placed first!
770
+ """
771
+ input_tokens, input_info = self.process_input(x, gt_index_lst, max_gts_lst)
772
+
773
+ # Pass tokens through Transformer
774
+ encoder_tokens = self.encoder(input_tokens)
775
+
776
+ # Decode tokens for each task using task-specific output adapters
777
+ preds = {
778
+ domain: self.output_adapters[domain](
779
+ encoder_tokens=encoder_tokens,
780
+ input_info=input_info,
781
+ )
782
+ for domain in self.output_adapters
783
+ }
784
+
785
+ return preds
786
+
787
+
788
+ def interpolate_pos_embed_multimae(
789
+ model: MultiViT,
790
+ checkpoint_model: Dict[str, Tensor],
791
+ ) -> None:
792
+ pattern = "input_adapters\.(.*)\.pos_emb"
793
+ matched_keys = [k for k in checkpoint_model if bool(re.match(pattern, k))]
794
+
795
+ for key in matched_keys:
796
+ domain = re.match(pattern, key).group(1) # group(0) is entire matched regex
797
+ if getattr(model.input_adapters, domain, None) is not None:
798
+ pos_embed_checkpoint = checkpoint_model[key]
799
+ _, _, orig_H, orig_W = pos_embed_checkpoint.shape
800
+ _, _, new_H, new_W = getattr(model.input_adapters, domain).pos_emb.shape
801
+ if (orig_H != new_H) or (orig_W != new_W):
802
+ print(
803
+ f"Key {key}: Position interpolate from {orig_H}x{orig_W} to {new_H}x{new_W}"
804
+ )
805
+ pos_embed_checkpoint = torch.nn.functional.interpolate(
806
+ pos_embed_checkpoint,
807
+ size=(new_H, new_W),
808
+ mode="bicubic",
809
+ align_corners=False,
810
+ )
811
+ checkpoint_model[key] = pos_embed_checkpoint
812
+
813
+
814
+ def construct_adapters(cfg: base_cfg):
815
+ INPUT_ADAPTERS = {
816
+ "rgb": PatchedInputAdapter(
817
+ num_channels=3,
818
+ stride_level=1,
819
+ patch_size_full=cfg.input_patch_size,
820
+ image_size=cfg.image_size,
821
+ learnable_pos_emb=cfg.learnable_pos_emb,
822
+ ),
823
+ "depth": PatchedInputAdapter(
824
+ num_channels=1,
825
+ stride_level=1,
826
+ patch_size_full=cfg.input_patch_size,
827
+ image_size=cfg.image_size,
828
+ learnable_pos_emb=cfg.learnable_pos_emb,
829
+ ),
830
+ }
831
+
832
+ num_classes = cfg.num_classes
833
+ if cfg.ground_truth_version in [5, 6]:
834
+ num_classes = 1
835
+
836
+ OUTPUT_ADAPTERS = {
837
+ "sod": partial(
838
+ ConvNeXtAdapter,
839
+ num_classes=num_classes,
840
+ image_size=cfg.image_size,
841
+ embed_dim=cfg.embed_dim,
842
+ patch_size=cfg.input_patch_size,
843
+ preds_per_patch=cfg.output_patch_size,
844
+ depth=cfg.decoder_depth,
845
+ interpolate_mode=cfg.decoder_interpolate_mode,
846
+ main_tasks=cfg.decoder_main_tasks,
847
+ act_fn=cfg.act_fn,
848
+ dec_kernel=cfg.dec_kernel,
849
+ ),
850
+ "rgb": partial(
851
+ ConvNeXtAdapter,
852
+ num_classes=3,
853
+ image_size=cfg.image_size,
854
+ embed_dim=cfg.embed_dim,
855
+ patch_size=cfg.input_patch_size,
856
+ preds_per_patch=cfg.output_patch_size,
857
+ depth=cfg.decoder_depth,
858
+ interpolate_mode=cfg.decoder_interpolate_mode,
859
+ main_tasks=cfg.decoder_main_tasks,
860
+ act_fn=cfg.act_fn,
861
+ dec_kernel=cfg.dec_kernel,
862
+ ),
863
+ "depth": partial(
864
+ ConvNeXtAdapter,
865
+ num_classes=1,
866
+ image_size=cfg.image_size,
867
+ embed_dim=cfg.embed_dim,
868
+ patch_size=cfg.input_patch_size,
869
+ preds_per_patch=cfg.output_patch_size,
870
+ depth=cfg.decoder_depth,
871
+ interpolate_mode=cfg.decoder_interpolate_mode,
872
+ main_tasks=cfg.decoder_main_tasks,
873
+ act_fn=cfg.act_fn,
874
+ dec_kernel=cfg.dec_kernel,
875
+ ),
876
+ }
877
+
878
+ if cfg.ground_truth_version == 3:
879
+ for i in range(cfg.num_classes):
880
+ OUTPUT_ADAPTERS[f"sod{i}"] = partial(
881
+ ConvNeXtAdapter,
882
+ num_classes=1,
883
+ image_size=cfg.image_size,
884
+ embed_dim=cfg.embed_dim,
885
+ patch_size=cfg.input_patch_size,
886
+ preds_per_patch=cfg.output_patch_size,
887
+ depth=cfg.decoder_depth,
888
+ interpolate_mode=cfg.decoder_interpolate_mode,
889
+ main_tasks=cfg.decoder_main_tasks,
890
+ act_fn=cfg.act_fn,
891
+ dec_kernel=cfg.dec_kernel,
892
+ )
893
+ return INPUT_ADAPTERS, OUTPUT_ADAPTERS
894
+
895
+
896
+ def generate_smultimae_model(cfg: base_cfg) -> Tuple[MultiViT, List[Dict]]:
897
+ """MULTIMAE"""
898
+ assert len(cfg.decoder_main_tasks) == len(
899
+ cfg.outputs
900
+ ), "Length of decoder main tasks must match length of outputs"
901
+
902
+ INPUT_ADAPTERS, OUTPUT_ADAPTERS = construct_adapters(cfg)
903
+
904
+ input_adapters = dict()
905
+ for input_key in cfg.inputs:
906
+ input_adapters[input_key] = INPUT_ADAPTERS[input_key]
907
+
908
+ output_adapters = dict()
909
+ for output_key, decoder_main_tasks_per_output in zip(
910
+ cfg.outputs, cfg.decoder_main_tasks
911
+ ):
912
+ output_adapters[output_key] = OUTPUT_ADAPTERS[output_key](
913
+ main_tasks=decoder_main_tasks_per_output
914
+ )
915
+
916
+ num_additional_gt_tokens = 0 # @deprecated
917
+ actual_num_additional_gt_tokens = 0 # @deprecated
918
+ if cfg.ground_truth_version in [5, 6]: # @deprecated
919
+ num_additional_gt_tokens = cfg.num_classes # @deprecated
920
+ actual_num_additional_gt_tokens = cfg.actual_num_classes # @deprecated
921
+ model = MultiViT(
922
+ input_adapters=input_adapters,
923
+ output_adapters=output_adapters,
924
+ freeze_encoder=cfg.freeze_encoder,
925
+ drop_path_rate=0.1,
926
+ dim_tokens=cfg.dim_tokens,
927
+ depth=cfg.encoder_depth,
928
+ num_heads=cfg.num_heads,
929
+ mlp_ratio=4,
930
+ qkv_bias=True,
931
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
932
+ num_additional_gt_tokens=num_additional_gt_tokens, # @deprecated
933
+ actual_num_additional_gt_tokens=actual_num_additional_gt_tokens, # @deprecated
934
+ ground_truth_version=cfg.ground_truth_version,
935
+ )
936
+
937
+ # return load_pretrained_backbone(cfg, model)
938
+ return model, []
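
For `ground_truth_version == 6`, `MultiViT.process_input` conditions the encoder on which of the available ground truths should be reproduced by appending one extra token next to the global tokens: a cosine of amplitude `A` whose phase is shifted by `gt_index / num_gts`. The snippet below is a minimal re-derivation of that formula as a standalone function (the defaults `dim_tokens=768` and `A=0.5` follow the constructor defaults above); it is a sketch for illustration, not part of the repository.

```python
import math
import torch
from torch import Tensor


def gt_index_token(gt_index: int, num_gts: int, dim_tokens: int = 768, A: float = 0.5) -> Tensor:
    """Phase-shifted cosine token that tells the model which ground truth
    (out of num_gts) to produce; re-derived from MultiViT.process_input."""
    T = 1 / num_gts
    t = [2 * math.pi * (offset / dim_tokens - gt_index * T) for offset in range(dim_tokens)]
    return A * torch.cos(Tensor(t)).reshape(1, 1, dim_tokens)


# Different gt_index values give tokens with distinct phases; at inference time
# the model is queried once per index to obtain multiple saliency hypotheses.
tok0 = gt_index_token(0, num_gts=5)
tok1 = gt_index_token(1, num_gts=5)
print(tok0.shape, torch.allclose(tok0, tok1))  # torch.Size([1, 1, 768]) False
```
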
s_multimae/model_pl.py ADDED
@@ -0,0 +1,105 @@
+ from collections import defaultdict
+ import os
+ from typing import Any, Dict, List, Optional, Tuple
+ import cv2
+ import torch
+ from torch import Tensor, nn
+ import torch.nn.functional as F
+ import pytorch_lightning as pl
+ import numpy as np
+
+ from .configs.base_config import base_cfg
+ from .rgbd_model import RGBDModel
+
+
+ class ModelPL(pl.LightningModule):
+     def __init__(self, cfg: base_cfg):
+         super().__init__()
+         self.cfg = cfg
+         self.model = RGBDModel(cfg)
+
+     def forward(self, images: Tensor, depths: Tensor):
+         return self.model.forward(images, depths)
+
+     def __inference_v1(
+         self, outputs: Dict[str, Tensor], image_sizes: List[Tuple[int, int]]
+     ):
+         res_lst: List[List[np.ndarray]] = []
+         for output, image_size in zip(outputs["sod"], image_sizes):
+             output: Tensor = F.interpolate(
+                 output.unsqueeze(0),
+                 size=(image_size[1], image_size[0]),
+                 mode="bilinear",
+                 align_corners=False,
+             )
+             res: np.ndarray = output.sigmoid().data.cpu().numpy().squeeze()
+             res = (res - res.min()) / (res.max() - res.min() + 1e-8)
+             if self.cfg.is_fp16:
+                 res = np.float32(res)
+             res_lst.append([(res * 255).astype(np.uint8)])
+         return res_lst
+
+     def __inference_v2(
+         self, outputs: Dict[str, Tensor], image_sizes: List[Tuple[int, int]]
+     ):
+         res_lst: List[List[np.ndarray]] = []
+         for output, image_size in zip(outputs["sod"], image_sizes):
+             output: Tensor = F.interpolate(
+                 output.unsqueeze(0),
+                 size=(image_size[1], image_size[0]),
+                 mode="bilinear",
+                 align_corners=False,
+             )
+             res: np.ndarray = torch.argmax(output, dim=1).cpu().numpy().squeeze()
+             res_lst.append([res])
+         return res_lst
+
+     def __inference_v3v5(
+         self, outputs: Dict[str, Tensor], image_sizes: List[Tuple[int, int]]
+     ):
+         res_lst: List[List[np.ndarray]] = []
+         for bi, image_size in enumerate(image_sizes):
+             res_lst_per_sample: List[np.ndarray] = []
+             for i in range(self.cfg.num_classes):
+                 pred = outputs[f"sod{i}"][bi]
+                 pred: Tensor = F.interpolate(
+                     pred.unsqueeze(0),
+                     size=(image_size[1], image_size[0]),
+                     mode="bilinear",
+                     align_corners=False,
+                 )
+                 res: np.ndarray = pred.sigmoid().data.cpu().numpy().squeeze()
+                 res = (res - res.min()) / (res.max() - res.min() + 1e-8)
+                 if self.cfg.is_fp16:
+                     res = np.float32(res)
+                 res_lst_per_sample.append((res * 255).astype(np.uint8))
+             res_lst.append(res_lst_per_sample)
+         return res_lst
+
+     @torch.no_grad()
+     def inference(
+         self,
+         image_sizes: List[Tuple[int, int]],
+         images: Tensor,
+         depths: Optional[Tensor],
+         max_gts: Optional[List[int]],
+     ) -> List[List[np.ndarray]]:
+         self.model.eval()
+         assert len(image_sizes) == len(
+             images
+         ), "The number of image_sizes must equal the number of images"
+         gpu_images: Tensor = images.to(self.device)
+         gpu_depths: Tensor = depths.to(self.device)
+
+         if self.cfg.ground_truth_version == 6:
+             with torch.cuda.amp.autocast(enabled=self.cfg.is_fp16):
+                 outputs: Dict[str, Tensor] = dict()
+                 for i in range(self.cfg.num_classes):
+                     outputs[f"sod{i}"] = self.model.inference(
+                         gpu_images, gpu_depths, [i] * gpu_images.shape[0], max_gts
+                     )["sod"]
+                 return self.__inference_v3v5(outputs, image_sizes)
+         else:
+             raise Exception(
+                 f"Unsupported ground_truth_version {self.cfg.ground_truth_version}"
+             )
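
The inference helpers above share the same post-processing: interpolate the predicted map back to the original resolution, apply a sigmoid, min-max normalize, and quantize to `uint8`. A standalone sketch with a dummy logit map standing in for one entry of `outputs["sod"]` (note that `image_size` is `(width, height)`, hence the swapped order in `size=`):

```python
import numpy as np
import torch
import torch.nn.functional as F

logits = torch.randn(1, 224, 224)  # (C=1, H, W) raw prediction for one sample
orig_w, orig_h = 640, 480          # original image size as (width, height)

# Resize back to the input resolution, squash to [0, 1], normalize and quantize.
pred = F.interpolate(
    logits.unsqueeze(0), size=(orig_h, orig_w), mode="bilinear", align_corners=False
)
res = pred.sigmoid().cpu().numpy().squeeze()
res = (res - res.min()) / (res.max() - res.min() + 1e-8)
saliency_map = (res * 255).astype(np.uint8)
print(saliency_map.shape, saliency_map.dtype)  # (480, 640) uint8
```
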
s_multimae/rgbd_model.py ADDED
@@ -0,0 +1,60 @@
+ from typing import Dict, List, Optional
+ from torch import nn, Tensor
+
+ from .model.multimae import generate_smultimae_model as generate_smultimae_model_v1
+ from .configs.base_config import base_cfg
+
+
+ class RGBDModel(nn.Module):
+     def __init__(self, cfg: base_cfg):
+         super(RGBDModel, self).__init__()
+
+         self.inputs = cfg.inputs
+         self.outputs = cfg.outputs
+
+         self.is_no_depth = cfg.is_inference_with_no_depth
+
+         if cfg.model_version == 1:
+             self.model, self.opt_params = generate_smultimae_model_v1(cfg)
+         else:
+             raise Exception(f"Unsupported model version {cfg.model_version}")
+
+     def encode_decode(
+         self,
+         images: Tensor,
+         depths: Optional[Tensor],
+         gt_index_lst: Optional[List[int]] = None,
+         max_gts_lst: Optional[List[int]] = None,
+     ) -> Dict[str, Tensor]:
+         """Encode the inputs with the backbone and decode them into a saliency
+         map of the same size as the input.
+
+         Returns:
+             {
+                 "sod": Tensor,
+                 "depth": Optional[Tensor],
+                 "rgb": Optional[Tensor],
+             }
+         """
+         inputs = {"rgb": images}
+         if "depth" in self.inputs:
+             inputs["depth"] = depths
+         return self.model.forward(inputs, gt_index_lst, max_gts_lst)
+
+     def forward(
+         self,
+         images: Tensor,
+         depths: Optional[Tensor],
+         gt_index_lst: Optional[List[int]] = None,
+         max_gts_lst: Optional[List[int]] = None,
+     ) -> Dict[str, Tensor]:
+         return self.encode_decode(images, depths, gt_index_lst, max_gts_lst)
+
+     def inference(
+         self,
+         images: Tensor,
+         depths: Optional[Tensor],
+         gt_index_lst: Optional[List[int]] = None,
+         max_gts_lst: Optional[List[int]] = None,
+     ) -> Dict[str, Tensor]:
+         return self.encode_decode(images, depths, gt_index_lst, max_gts_lst)
s_multimae/utils.py ADDED
@@ -0,0 +1,236 @@
1
+ from PIL import Image
2
+ from glob import glob
3
+ import random
4
+ from typing import Dict, List
5
+ from torch import nn, Tensor
6
+ import os, shutil
7
+ import torch
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import gc, cv2
11
+
12
+ from .visualizer import post_processing_depth
13
+
14
+ """
15
+ This module should not depend on other s_multimae modules.
16
+ """
17
+
18
+ num_format = "{:,}".format
19
+
20
+
21
+ def list_dirs(dir_root: str) -> List[str]:
22
+ return list(
23
+ sorted(
24
+ [
25
+ item
26
+ for item in os.listdir(dir_root)
27
+ if os.path.isdir(f"{dir_root}/{item}")
28
+ ]
29
+ )
30
+ )
31
+
32
+
33
+ def clean_cache() -> None:
34
+ torch.cuda.empty_cache()
35
+ gc.collect()
36
+
37
+
38
+ def count_parameters(model: nn.Module) -> str:
39
+ """Count the number of learnable parameters of a model"""
40
+ return num_format(sum(p.numel() for p in model.parameters() if p.requires_grad))
41
+
42
+
43
+ def ranking_gts_to_dict(
44
+ ranking_gts: List[np.ndarray | str],
45
+ ) -> Dict[str, np.ndarray | str]:
46
+ """
47
+ Return:
48
+ dict(
49
+ gt0=ranking_gts[0],
50
+ gt1=ranking_gts[1],
51
+ gt2=ranking_gts[2],
52
+ gt3=ranking_gts[3],
53
+ gt4=ranking_gts[4],
54
+ )
55
+ """
56
+ return {f"gt{i}": v for i, v in enumerate(ranking_gts)}
57
+
58
+
59
+ def dict_to_ranking_gts(d: Dict[str, np.ndarray], l=5) -> List[np.ndarray]:
60
+ """
61
+ Return: [ranking_gts["gt0"], ranking_gts["gt1"], ...]
62
+ """
63
+ return [d[f"gt{i}"] for i in range(l)]
64
+
65
+
66
+ def random_choice(p: float) -> bool:
67
+ """Return True if random float <= p"""
68
+ return random.random() <= p
69
+
70
+
71
+ def fname_without_ext(p: str) -> str:
72
+ return os.path.splitext(os.path.basename(p))[0]
73
+
74
+
75
+ def list_files(
76
+ dirpaths: List[str] = [
77
+ "datasets/v1/train/RGB",
78
+ "datasets/v1/train/GT",
79
+ "datasets/v1/train/depths",
80
+ ],
81
+ ) -> List[List[str]]:
82
+ assert len(dirpaths) >= 1, "dirnames must contain at least 1 item"
83
+
84
+ fullpaths_lst: List[List[str]] = []
85
+ names_lst: List[List[str]] = []
86
+
87
+ for dirname in dirpaths:
88
+ fullpaths = list(sorted(glob(os.path.join(dirname, "*"))))
89
+ names = [fname_without_ext(fullpath) for fullpath in fullpaths]
90
+ fullpaths_lst.append(fullpaths)
91
+ names_lst.append(names)
92
+
93
+ rs: List[List[str]] = [fullpaths_lst[0]] + [[] for _ in range(len(dirpaths) - 1)]
94
+
95
+ # Ensure integrity
96
+ assert (
97
+ len(set([len(e) for e in names_lst])) == 1
98
+ ), f"Data is not integrity {[len(e) for e in names_lst]} | dirpath = {dirpaths}"
99
+
100
+ for name in names_lst[0]:
101
+ for i, names in enumerate(names_lst[1:]):
102
+ idx = names.index(name)
103
+ rs[i + 1].append(fullpaths_lst[i + 1][idx])
104
+
105
+ return rs
106
+
107
+
108
+ def scale_saliency_maps(inputs: Tensor) -> Tensor:
109
+ """Input: Tensor, shape of (B, C, H, W)"""
110
+ min_v = (
111
+ torch.min(torch.flatten(inputs, 1), dim=1)[0]
112
+ .unsqueeze(1)
113
+ .unsqueeze(1)
114
+ .unsqueeze(1)
115
+ )
116
+ max_v = (
117
+ torch.max(torch.flatten(inputs, 1), dim=1)[0]
118
+ .unsqueeze(1)
119
+ .unsqueeze(1)
120
+ .unsqueeze(1)
121
+ )
122
+ return (inputs - min_v) / (max_v - min_v + 1e-8)
123
+
124
+
125
+ def get_epoch_from_ckpt_path(ckpt_path: str) -> int:
126
+ """Example ckpt_path
127
+ os.path.join(experiment_dir_path, 'cfgv2.3', 'checkpoint_100.pt')
128
+ """
129
+ return int(ckpt_path.split("_")[-1].split(".")[0])
130
+
131
+
132
+ def clean_dir(dir_path: str) -> None:
133
+ """Remove a directory if existed and create an empty directory"""
134
+ if os.path.isdir(dir_path):
135
+ shutil.rmtree(dir_path)
136
+ os.makedirs(dir_path, exist_ok=True)
137
+
138
+
139
+ def get_sota_type(experiment_name: str) -> int:
140
+ """0 for SOTAs, 4 for experiment version 4, e.g. ..."""
141
+ if "cfgv" not in experiment_name:
142
+ return 0
143
+
144
+ half_right = experiment_name.split("cfgv")[1]
145
+ return int(half_right.split("_")[0])
146
+
147
+
148
+ def hex_to_rgb(hex: str) -> np.ndarray:
149
+ """Convert hex color to rgb color
150
+
151
+ Args:
152
+ hex (str): "#00f900"
153
+
154
+ Returns:
155
+ np.ndarray: numpy array of rgb color
156
+ """
157
+ hex = hex[1:]
158
+ rgb = []
159
+ for i in (0, 2, 4):
160
+ decimal = int(hex[i : i + 2], 16)
161
+ rgb.append(decimal)
162
+
163
+ return (np.array(rgb) / 255.0)[::-1]
164
+
165
+
166
+ def normalize(data: np.ndarray) -> np.ndarray:
167
+ return (data - data.min()) / (data.max() - data.min() + 1e-8)
168
+
169
+
170
+ def post_processing_depth(depth_path: str) -> np.ndarray:
171
+ depth = np.array(Image.open(depth_path).convert("L"))
172
+ depth = (normalize(depth) * 255).astype(np.uint8)
173
+ return cv2.applyColorMap(depth, cv2.COLORMAP_SUMMER)
174
+
175
+
176
+ def convert_batch_tensors_to_numpy_images(images: Tensor) -> np.ndarray:
177
+ """images of shape (batch_size, channels, width, height)"""
178
+ images = torch.permute(images, (0, 2, 3, 1))
179
+ images = images.numpy()
180
+ if images.shape[3] == 1:
181
+ return np.squeeze(images, axis=3)
182
+ else:
183
+ return images
184
+
185
+
186
+ def join_horizontally(lst: List[np.ndarray]) -> np.ndarray:
187
+ return np.concatenate(lst, axis=1)
188
+
189
+
190
+ def join_vertically(lst: List[np.ndarray]) -> np.ndarray:
191
+ return np.concatenate(lst, axis=0)
192
+
193
+
194
+ def plot_batch_of_pairs(
195
+ images: Tensor,
196
+ depths: Tensor,
197
+ gts: Tensor,
198
+ save_file_path: str,
199
+ ) -> None:
200
+ images = convert_batch_tensors_to_numpy_images(images)
201
+ depths = convert_batch_tensors_to_numpy_images(depths)
202
+ gts = convert_batch_tensors_to_numpy_images(gts)
203
+ batch_size = images.shape[0]
204
+ samples: List[np.ndarray] = []
205
+
206
+ # fig, axes = plt.subplots(batch_size, 3, figsize=(3*batch_size, 20)) # (number of images, 3)
207
+ for i in range(batch_size):
208
+ samples.append(
209
+ join_horizontally(
210
+ [
211
+ ((images[i] + 1.0) / 2 * 255).astype(np.uint8),
212
+ post_processing_depth(depths[i]),
213
+ post_processing_depth(gts[i]),
214
+ ]
215
+ )
216
+ )
217
+ # axes[i, 0].imshow(images[i])
218
+ # axes[i, 1].imshow(depths[i])
219
+ # axes[i, 2].imshow(gts[i])
220
+ # plt.show()
221
+
222
+ final = join_vertically(samples)
223
+ cv2.imwrite(save_file_path, cv2.cvtColor(final, cv2.COLOR_RGB2BGR))
224
+ print(f"Saved to file {save_file_path}")
225
+
226
+
227
+ def plot_pairs(image: np.ndarray, depth: np.ndarray, gt: np.ndarray) -> None:
228
+ batch_size = 1
229
+ fig, axes = plt.subplots(
230
+ batch_size, 3, figsize=(3 * batch_size, 20)
231
+ ) # (number of images, 3)
232
+ for i in range(batch_size):
233
+ axes[i, 0].imshow(image)
234
+ axes[i, 1].imshow(depth)
235
+ axes[i, 2].imshow(gt)
236
+ plt.show()
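
Two small helpers from `utils.py` worth illustrating (assuming the repository root is on `PYTHONPATH` and the requirements are installed): `hex_to_rgb` scales a `#rrggbb` string to `[0, 1]` and reverses the channel order, so despite its name the result is in BGR order, which matches OpenCV-style drawing; `ranking_gts_to_dict` simply keys a list of masks as `gt0..gtN`.

```python
import numpy as np
from s_multimae.utils import hex_to_rgb, ranking_gts_to_dict

# "#ff8000" is R=255, G=128, B=0; the helper returns ~[0.0, 0.502, 1.0],
# i.e. (B, G, R) scaled to [0, 1].
print(hex_to_rgb("#ff8000"))

# Round-trip a list of dummy masks through the {"gt0": ..., "gt4": ...} layout.
masks = [np.zeros((4, 4), dtype=np.uint8) for _ in range(5)]
print(sorted(ranking_gts_to_dict(masks).keys()))  # ['gt0', 'gt1', 'gt2', 'gt3', 'gt4']
```
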
s_multimae/visualize_2d_posemb.py ADDED
@@ -0,0 +1,58 @@
+ import numpy as np
+ from torch import Tensor
+ import matplotlib.pyplot as plt
+
+ from s_multimae.model.multimae import build_2d_sincos_posemb
+
+
+ def visualize_2d_posemb():
+     NH, NW = 14, 14
+     dim_tokens = 768
+
+     colors = [
+         "Greys",
+         "Purples",
+         "Blues",
+         "Greens",
+         "Oranges",
+         "Reds",
+         "YlOrBr",
+         "YlOrRd",
+         "OrRd",
+         "PuRd",
+         "RdPu",
+         "BuPu",
+         "GnBu",
+         "PuBu",
+         "YlGnBu",
+         "PuBuGn",
+         "BuGn",
+         "YlGn",
+     ]
+
+     pos_emb: Tensor = build_2d_sincos_posemb(NH, NW, dim_tokens)
+     pos_emb_numpy: np.ndarray = (
+         pos_emb.squeeze(0).permute(1, 2, 0).numpy()
+     )  # 14 x 14 x 768
+
+     x = np.linspace(0, NH - 1, NH)
+     y = np.linspace(0, NW - 1, NW)
+     X, Y = np.meshgrid(x, y)
+
+     for color, i in zip(colors, range(0, pos_emb_numpy.shape[2], 100)):
+         ax = plt.axes(projection="3d")
+         Z = pos_emb_numpy[:, :, i]
+
+         # plt.imshow(Z, cmap='viridis')
+         # plt.savefig(f'posemb_visualization/test_{i}.png')
+
+         ax.plot_surface(
+             X,
+             Y,
+             Z,
+             # rstride=1, cstride=1,
+             cmap="viridis",
+             edgecolor="none",
+         )
+         plt.show()
+         plt.savefig(f"posemb_visualization/test_{i}.png")
s_multimae/visualizer.py ADDED
@@ -0,0 +1,711 @@
1
+ import colorsys
2
+ from typing import Union
3
+ import numpy as np
4
+ import cv2
5
+ import matplotlib.colors as mplc
6
+ import pycocotools.mask as mask_util
7
+ import matplotlib.figure as mplfigure
8
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
9
+ import matplotlib as mpl
10
+ from enum import Enum, unique
11
+ from PIL import Image
12
+
13
+ _LARGE_MASK_AREA_THRESH = 120000
14
+ _COLORS = (
15
+ np.array(
16
+ [
17
+ 0.000,
18
+ 0.447,
19
+ 0.741,
20
+ 0.850,
21
+ 0.325,
22
+ 0.098,
23
+ 0.929,
24
+ 0.694,
25
+ 0.125,
26
+ 0.494,
27
+ 0.184,
28
+ 0.556,
29
+ 0.466,
30
+ 0.674,
31
+ 0.188,
32
+ 0.301,
33
+ 0.745,
34
+ 0.933,
35
+ 0.635,
36
+ 0.078,
37
+ 0.184,
38
+ 0.300,
39
+ 0.300,
40
+ 0.300,
41
+ 0.600,
42
+ 0.600,
43
+ 0.600,
44
+ 1.000,
45
+ 0.000,
46
+ 0.000,
47
+ 1.000,
48
+ 0.500,
49
+ 0.000,
50
+ 0.749,
51
+ 0.749,
52
+ 0.000,
53
+ 0.000,
54
+ 1.000,
55
+ 0.000,
56
+ 0.000,
57
+ 0.000,
58
+ 1.000,
59
+ 0.667,
60
+ 0.000,
61
+ 1.000,
62
+ 0.333,
63
+ 0.333,
64
+ 0.000,
65
+ 0.333,
66
+ 0.667,
67
+ 0.000,
68
+ 0.333,
69
+ 1.000,
70
+ 0.000,
71
+ 0.667,
72
+ 0.333,
73
+ 0.000,
74
+ 0.667,
75
+ 0.667,
76
+ 0.000,
77
+ 0.667,
78
+ 1.000,
79
+ 0.000,
80
+ 1.000,
81
+ 0.333,
82
+ 0.000,
83
+ 1.000,
84
+ 0.667,
85
+ 0.000,
86
+ 1.000,
87
+ 1.000,
88
+ 0.000,
89
+ 0.000,
90
+ 0.333,
91
+ 0.500,
92
+ 0.000,
93
+ 0.667,
94
+ 0.500,
95
+ 0.000,
96
+ 1.000,
97
+ 0.500,
98
+ 0.333,
99
+ 0.000,
100
+ 0.500,
101
+ 0.333,
102
+ 0.333,
103
+ 0.500,
104
+ 0.333,
105
+ 0.667,
106
+ 0.500,
107
+ 0.333,
108
+ 1.000,
109
+ 0.500,
110
+ 0.667,
111
+ 0.000,
112
+ 0.500,
113
+ 0.667,
114
+ 0.333,
115
+ 0.500,
116
+ 0.667,
117
+ 0.667,
118
+ 0.500,
119
+ 0.667,
120
+ 1.000,
121
+ 0.500,
122
+ 1.000,
123
+ 0.000,
124
+ 0.500,
125
+ 1.000,
126
+ 0.333,
127
+ 0.500,
128
+ 1.000,
129
+ 0.667,
130
+ 0.500,
131
+ 1.000,
132
+ 1.000,
133
+ 0.500,
134
+ 0.000,
135
+ 0.333,
136
+ 1.000,
137
+ 0.000,
138
+ 0.667,
139
+ 1.000,
140
+ 0.000,
141
+ 1.000,
142
+ 1.000,
143
+ 0.333,
144
+ 0.000,
145
+ 1.000,
146
+ 0.333,
147
+ 0.333,
148
+ 1.000,
149
+ 0.333,
150
+ 0.667,
151
+ 1.000,
152
+ 0.333,
153
+ 1.000,
154
+ 1.000,
155
+ 0.667,
156
+ 0.000,
157
+ 1.000,
158
+ 0.667,
159
+ 0.333,
160
+ 1.000,
161
+ 0.667,
162
+ 0.667,
163
+ 1.000,
164
+ 0.667,
165
+ 1.000,
166
+ 1.000,
167
+ 1.000,
168
+ 0.000,
169
+ 1.000,
170
+ 1.000,
171
+ 0.333,
172
+ 1.000,
173
+ 1.000,
174
+ 0.667,
175
+ 1.000,
176
+ 0.333,
177
+ 0.000,
178
+ 0.000,
179
+ 0.500,
180
+ 0.000,
181
+ 0.000,
182
+ 0.667,
183
+ 0.000,
184
+ 0.000,
185
+ 0.833,
186
+ 0.000,
187
+ 0.000,
188
+ 1.000,
189
+ 0.000,
190
+ 0.000,
191
+ 0.000,
192
+ 0.167,
193
+ 0.000,
194
+ 0.000,
195
+ 0.333,
196
+ 0.000,
197
+ 0.000,
198
+ 0.500,
199
+ 0.000,
200
+ 0.000,
201
+ 0.667,
202
+ 0.000,
203
+ 0.000,
204
+ 0.833,
205
+ 0.000,
206
+ 0.000,
207
+ 1.000,
208
+ 0.000,
209
+ 0.000,
210
+ 0.000,
211
+ 0.167,
212
+ 0.000,
213
+ 0.000,
214
+ 0.333,
215
+ 0.000,
216
+ 0.000,
217
+ 0.500,
218
+ 0.000,
219
+ 0.000,
220
+ 0.667,
221
+ 0.000,
222
+ 0.000,
223
+ 0.833,
224
+ 0.000,
225
+ 0.000,
226
+ 1.000,
227
+ 0.000,
228
+ 0.000,
229
+ 0.000,
230
+ 0.143,
231
+ 0.143,
232
+ 0.143,
233
+ 0.857,
234
+ 0.857,
235
+ 0.857,
236
+ 1.000,
237
+ 1.000,
238
+ 1.000,
239
+ ]
240
+ )
241
+ .astype(np.float32)
242
+ .reshape(-1, 3)
243
+ )
244
+
245
+
246
+ def random_color(rgb=False, maximum=255):
247
+ """
248
+ Args:
249
+ rgb (bool): whether to return RGB colors or BGR colors.
250
+ maximum (int): either 255 or 1
251
+
252
+ Returns:
253
+ ndarray: a vector of 3 numbers
254
+ """
255
+ idx = np.random.randint(0, len(_COLORS))
256
+ ret = _COLORS[idx] * maximum
257
+ if not rgb:
258
+ ret = ret[::-1]
259
+ return ret
260
+
261
+
262
+ @unique
263
+ class ColorMode(Enum):
264
+ """
265
+ Enum of different color modes to use for instance visualizations.
266
+ """
267
+
268
+ IMAGE = 0
269
+ """
270
+ Picks a random color for every instance and overlay segmentations with low opacity.
271
+ """
272
+ SEGMENTATION = 1
273
+ """
274
+ Let instances of the same category have similar colors
275
+ (from metadata.thing_colors), and overlay them with
276
+ high opacity. This provides more attention on the quality of segmentation.
277
+ """
278
+ IMAGE_BW = 2
279
+ """
280
+ Same as IMAGE, but convert all areas without masks to gray-scale.
281
+ Only available for drawing per-instance mask predictions.
282
+ """
283
+
284
+
285
+ class VisImage:
286
+ def __init__(self, img, scale=1.0):
287
+ """
288
+ Args:
289
+ img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
290
+ scale (float): scale the input image
291
+ """
292
+ self.img = img
293
+ self.scale = scale
294
+ self.width, self.height = img.shape[1], img.shape[0]
295
+ self._setup_figure(img)
296
+
297
+ def _setup_figure(self, img):
298
+ """
299
+ Args:
300
+ Same as in :meth:`__init__()`.
301
+
302
+ Returns:
303
+ fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
304
+ ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
305
+ """
306
+ fig = mplfigure.Figure(frameon=False)
307
+ self.dpi = fig.get_dpi()
308
+ # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
309
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
310
+ fig.set_size_inches(
311
+ (self.width * self.scale + 1e-2) / self.dpi,
312
+ (self.height * self.scale + 1e-2) / self.dpi,
313
+ )
314
+ self.canvas = FigureCanvasAgg(fig)
315
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
316
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
317
+ ax.axis("off")
318
+ self.fig = fig
319
+ self.ax = ax
320
+ self.reset_image(img)
321
+
322
+ def reset_image(self, img):
323
+ """
324
+ Args:
325
+ img: same as in __init__
326
+ """
327
+ img = img.astype("uint8")
328
+ self.ax.imshow(
329
+ img, extent=(0, self.width, self.height, 0), interpolation="nearest"
330
+ )
331
+
332
+ def save(self, filepath):
333
+ """
334
+ Args:
335
+ filepath (str): a string that contains the absolute path, including the file name, where
336
+ the visualized image will be saved.
337
+ """
338
+ self.fig.savefig(filepath)
339
+
340
+ def get_image(self):
341
+ """
342
+ Returns:
343
+ ndarray:
344
+ the visualized image of shape (H, W, 3) (RGB) in uint8 type.
345
+ The shape is scaled w.r.t the input image using the given `scale` argument.
346
+ """
347
+ canvas = self.canvas
348
+ s, (width, height) = canvas.print_to_buffer()
349
+ # buf = io.BytesIO() # works for cairo backend
350
+ # canvas.print_rgba(buf)
351
+ # width, height = self.width, self.height
352
+ # s = buf.getvalue()
353
+
354
+ buffer = np.frombuffer(s, dtype="uint8")
355
+
356
+ img_rgba = buffer.reshape(height, width, 4)
357
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
358
+ return rgb.astype("uint8")
359
+
360
+
361
+ class GenericMask:
362
+ """
363
+ Attribute:
364
+ polygons (list[ndarray]): list[ndarray]: polygons for this mask.
365
+ Each ndarray has format [x, y, x, y, ...]
366
+ mask (ndarray): a binary mask
367
+ """
368
+
369
+ def __init__(self, mask_or_polygons, height, width):
370
+ self._mask = self._polygons = self._has_holes = None
371
+ self.height = height
372
+ self.width = width
373
+
374
+ m = mask_or_polygons
375
+ if isinstance(m, dict):
376
+ # RLEs
377
+ assert "counts" in m and "size" in m
378
+ if isinstance(m["counts"], list): # uncompressed RLEs
379
+ h, w = m["size"]
380
+ assert h == height and w == width
381
+ m = mask_util.frPyObjects(m, h, w)
382
+ self._mask = mask_util.decode(m)[:, :]
383
+ return
384
+
385
+ if isinstance(m, list): # list[ndarray]
386
+ self._polygons = [np.asarray(x).reshape(-1) for x in m]
387
+ return
388
+
389
+ if isinstance(m, np.ndarray): # assumed to be a binary mask
390
+ assert m.shape[1] != 2, m.shape
391
+ assert m.shape == (
392
+ height,
393
+ width,
394
+ ), f"mask shape: {m.shape}, target dims: {height}, {width}"
395
+ self._mask = m.astype("uint8")
396
+ return
397
+
398
+ raise ValueError(
399
+ "GenericMask cannot handle object {} of type '{}'".format(m, type(m))
400
+ )
401
+
402
+ @property
403
+ def mask(self):
404
+ if self._mask is None:
405
+ self._mask = self.polygons_to_mask(self._polygons)
406
+ return self._mask
407
+
408
+ @property
409
+ def polygons(self):
410
+ if self._polygons is None:
411
+ self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
412
+ return self._polygons
413
+
414
+ @property
415
+ def has_holes(self):
416
+ if self._has_holes is None:
417
+ if self._mask is not None:
418
+ self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
419
+ else:
420
+ self._has_holes = (
421
+ False # if original format is polygon, does not have holes
422
+ )
423
+ return self._has_holes
424
+
425
+ def mask_to_polygons(self, mask):
426
+ # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
427
+ # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
428
+ # Internal contours (holes) are placed in hierarchy-2.
429
+ # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
430
+ mask = np.ascontiguousarray(
431
+ mask
432
+ ) # some versions of cv2 do not support non-contiguous arrays
433
+ res = cv2.findContours(
434
+ mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE
435
+ )
436
+ hierarchy = res[-1]
437
+ if hierarchy is None: # empty mask
438
+ return [], False
439
+ has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
440
+ res = res[-2]
441
+ res = [x.flatten() for x in res]
442
+ # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
443
+ # We add 0.5 to turn them into real-value coordinate space. A better solution
444
+ # would be to first +0.5 and then dilate the returned polygon by 0.5.
445
+ res = [x + 0.5 for x in res if len(x) >= 6]
446
+ return res, has_holes
447
+
448
+ def polygons_to_mask(self, polygons):
449
+ rle = mask_util.frPyObjects(polygons, self.height, self.width)
450
+ rle = mask_util.merge(rle)
451
+ return mask_util.decode(rle)[:, :]
452
+
453
+ def area(self):
454
+ return self.mask.sum()
455
+
456
+ def bbox(self):
457
+ p = mask_util.frPyObjects(self.polygons, self.height, self.width)
458
+ p = mask_util.merge(p)
459
+ bbox = mask_util.toBbox(p)
460
+ bbox[2] += bbox[0]
461
+ bbox[3] += bbox[1]
462
+ return bbox
463
+
464
+
465
+ class Visualizer:
466
+ """
467
+ Visualizer that draws data about detection/segmentation on images.
468
+
469
+ It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
470
+ that draw primitive objects to images, as well as high-level wrappers like
471
+ `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
472
+ that draw composite data in some pre-defined style.
473
+
474
+ Note that the exact visualization style for the high-level wrappers are subject to change.
475
+ Style such as color, opacity, label contents, visibility of labels, or even the visibility
476
+ of objects themselves (e.g. when the object is too small) may change according
477
+ to different heuristics, as long as the results still look visually reasonable.
478
+
479
+ To obtain a consistent style, you can implement custom drawing functions with the
480
+ abovementioned primitive methods instead. If you need more customized visualization
481
+ styles, you can process the data yourself following their format documented in
482
+ tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
483
+ intend to satisfy everyone's preference on drawing styles.
484
+
485
+ This visualizer focuses on high rendering quality rather than performance. It is not
486
+ designed to be used for real-time applications.
487
+ """
488
+
489
+ # TODO implement a fast, rasterized version using OpenCV
490
+
491
+ def __init__(
492
+ self,
493
+ img_rgb: Union[Image.Image, np.ndarray],
494
+ scale=1.0,
495
+ instance_mode=ColorMode.IMAGE,
496
+ ):
497
+ """
498
+ Args:
499
+ img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
500
+ the height and width of the image respectively. C is the number of
501
+ color channels. The image is required to be in RGB format since that
502
+ is a requirement of the Matplotlib library. The image is also expected
503
+ to be in the range [0, 255].
504
+ instance_mode (ColorMode): defines one of the pre-defined style for drawing
505
+ instances on an image.
506
+ """
507
+ if type(img_rgb) == np.ndarray:
508
+ img_rgb = img_rgb[:, :, ::-1]
509
+ else:
510
+ img_rgb = np.array(img_rgb)[:, :, ::-1]
511
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
512
+ self.output = VisImage(self.img, scale=scale)
513
+
514
+ # too small texts are useless, therefore clamp the default font size to at least 10
515
+ self._default_font_size = max(
516
+ np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
517
+ )
518
+ self._instance_mode = instance_mode
519
+
520
+ def draw_binary_mask(
521
+ self,
522
+ binary_mask,
523
+ color=None,
524
+ *,
525
+ edge_color=None,
526
+ text=None,
527
+ alpha=0.5,
528
+ area_threshold=10,
529
+ ):
530
+ """
531
+ Args:
532
+ binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
533
+ W is the image width. Each value in the array is either a 0 or 1 value of uint8
534
+ type.
535
+ color: color of the mask. Refer to `matplotlib.colors` for a full list of
536
+ formats that are accepted. If None, will pick a random color.
537
+ edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
538
+ full list of formats that are accepted.
539
+ text (str): if not None, the text will be drawn on the object
540
+ alpha (float): blending coefficient. Smaller values lead to more transparent masks.
541
+ area_threshold (float): a connected component smaller than this area will not be shown.
542
+
543
+ Returns:
544
+ output (VisImage): image object with mask drawn.
545
+ """
546
+ if color is None:
547
+ color = random_color(rgb=True, maximum=1)
548
+ color = mplc.to_rgb(color)
549
+
550
+ has_valid_segment = False
551
+ binary_mask = binary_mask.astype("uint8") # opencv needs uint8
552
+ mask = GenericMask(binary_mask, self.output.height, self.output.width)
553
+ shape2d = (binary_mask.shape[0], binary_mask.shape[1])
554
+
555
+ if not mask.has_holes:
556
+ # draw polygons for regular masks
557
+ for segment in mask.polygons:
558
+ area = mask_util.area(
559
+ mask_util.frPyObjects([segment], shape2d[0], shape2d[1])
560
+ )
561
+ if area < (area_threshold or 0):
562
+ continue
563
+ has_valid_segment = True
564
+ segment = segment.reshape(-1, 2)
565
+ self.draw_polygon(
566
+ segment, color=color, edge_color=edge_color, alpha=alpha
567
+ )
568
+ else:
569
+ # TODO: Use Path/PathPatch to draw vector graphics:
570
+ # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
571
+ rgba = np.zeros(shape2d + (4,), dtype="float32")
572
+ rgba[:, :, :3] = color
573
+ rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
574
+ has_valid_segment = True
575
+ self.output.ax.imshow(
576
+ rgba, extent=(0, self.output.width, self.output.height, 0)
577
+ )
578
+
579
+ if text is not None and has_valid_segment:
580
+ lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
581
+ self._draw_text_in_mask(binary_mask, text, lighter_color)
582
+ return self.output
583
+
584
+ def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
585
+ """
586
+ Args:
587
+ segment: numpy array of shape Nx2, containing all the points in the polygon.
588
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
589
+ formats that are accepted.
590
+ edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
591
+ full list of formats that are accepted. If not provided, a darker shade
592
+ of the polygon color will be used instead.
593
+ alpha (float): blending coefficient. Smaller values lead to more transparent masks.
594
+
595
+ Returns:
596
+ output (VisImage): image object with polygon drawn.
597
+ """
598
+ if edge_color is None:
599
+ # make edge color darker than the polygon color
600
+ if alpha > 0.8:
601
+ edge_color = self._change_color_brightness(
602
+ color, brightness_factor=-0.7
603
+ )
604
+ else:
605
+ edge_color = color
606
+ edge_color = mplc.to_rgb(edge_color) + (1,)
607
+
608
+ polygon = mpl.patches.Polygon(
609
+ segment,
610
+ fill=True,
611
+ facecolor=mplc.to_rgb(color) + (alpha,),
612
+ edgecolor=edge_color,
613
+ linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
614
+ )
615
+ self.output.ax.add_patch(polygon)
616
+ return self.output
617
+
618
+ """
619
+ Internal methods:
620
+ """
621
+
622
+ def _change_color_brightness(self, color, brightness_factor):
623
+ """
624
+ Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
625
+ less or more saturation than the original color.
626
+
627
+ Args:
628
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
629
+ formats that are accepted.
630
+ brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
631
+ 0 will correspond to no change, a factor in [-1.0, 0) range will result in
632
+ a darker color and a factor in (0, 1.0] range will result in a lighter color.
633
+
634
+ Returns:
635
+ modified_color (tuple[double]): a tuple containing the RGB values of the
636
+ modified color. Each value in the tuple is in the [0.0, 1.0] range.
637
+ """
638
+ assert brightness_factor >= -1.0 and brightness_factor <= 1.0
639
+ color = mplc.to_rgb(color)
640
+ polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
641
+ modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
642
+ modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
643
+ modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
644
+ modified_color = colorsys.hls_to_rgb(
645
+ polygon_color[0], modified_lightness, polygon_color[2]
646
+ )
647
+ return modified_color
648
+
649
+ def _draw_text_in_mask(self, binary_mask, text, color):
650
+ """
651
+ Find proper places to draw text given a binary mask.
652
+ """
653
+ # TODO sometimes drawn on wrong objects. the heuristics here can improve.
654
+ _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(
655
+ binary_mask, 8
656
+ )
657
+ if stats[1:, -1].size == 0:
658
+ return
659
+ largest_component_id = np.argmax(stats[1:, -1]) + 1
660
+
661
+ # draw text on the largest component, as well as other very large components.
662
+ for cid in range(1, _num_cc):
663
+ if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
664
+ # median is more stable than centroid
665
+ # center = centroids[largest_component_id]
666
+ center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
667
+ self.draw_text(text, center, color=color)
668
+
669
+ def get_output(self):
670
+ """
671
+ Returns:
672
+ output (VisImage): the image output containing the visualizations added
673
+ to the image.
674
+ """
675
+ return self.output
676
+
677
+
678
+ def apply_threshold(pred: np.ndarray) -> np.ndarray:
679
+ """Apply threshold to a salient map
680
+
681
+ Args:
682
+ pred (np.ndarray): each pixel is in range [0, 255]
683
+
684
+ Returns:
685
+ np.ndarray: each pixel is only 0.0 or 1.0
686
+ """
687
+ binary_mask = pred / 255.0
688
+ binary_mask[binary_mask >= 0.5] = 1.0
689
+ binary_mask[binary_mask < 0.5] = 0.0
690
+ return binary_mask
691
+
692
+
693
+ def normalize(data: np.ndarray) -> np.ndarray:
694
+ return (data - data.min()) / (data.max() - data.min() + 1e-8)
695
+
696
+
697
+ def post_processing_depth(depth: np.ndarray) -> np.ndarray:
698
+ depth = (normalize(depth) * 255).astype(np.uint8)
699
+ return cv2.applyColorMap(depth, cv2.COLORMAP_OCEAN)
700
+
701
+
702
+ def apply_vis_to_image(
703
+ rgb: np.ndarray, binary_mask: np.ndarray, color: np.ndarray
704
+ ) -> np.ndarray:
705
+ if rgb.shape[:2] != binary_mask.shape[:2]:
706
+ print(rgb.shape, binary_mask.shape)
707
+ binary_mask = cv2.resize(binary_mask, [rgb.shape[1], rgb.shape[0]])
708
+ visualizer = Visualizer(rgb)
709
+ vis_image: VisImage = visualizer.draw_binary_mask(binary_mask, color)
710
+ vis_image = vis_image.get_image()[:, :, ::-1]
711
+ return vis_image
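A minimal usage sketch of the overlay utilities defined above, assuming the classes and functions in this file are in scope; the file names are hypothetical:

    import cv2
    import numpy as np

    rgb = cv2.imread("example_rgb.png")                          # hypothetical image read with OpenCV
    pred = cv2.imread("example_pred.png", cv2.IMREAD_GRAYSCALE)  # saliency map with values in [0, 255]

    binary_mask = apply_threshold(pred)                          # {0.0, 1.0} mask
    overlay = apply_vis_to_image(rgb, binary_mask, np.array([0.0, 1.0, 0.0]))  # green overlay
    cv2.imwrite("example_overlay.png", overlay)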
streamlit_apps/__init__.py ADDED
File without changes
streamlit_apps/app.py ADDED
@@ -0,0 +1,91 @@
1
+ import os, sys
2
+
3
+ sys.path.append(os.getcwd())
4
+
5
+ import multiprocessing
6
+
7
+ import streamlit as st
8
+
9
+ from app_utils.color_selection_ui import color_selection_ui
10
+ from app_utils.depth_selection_ui import depth_selection_ui
11
+ from app_utils.device import device
12
+ from app_utils.sod_selection_ui import sod_selection_ui
13
+
14
+
15
+ class MODE:
16
+ IMAGE = "image"
17
+ VIDEO = "video"
18
+ WEBRTC = "webrtc"
19
+ DEMO = "demo"
20
+
21
+
22
+ TITLE = "S-MultiMAE: A Multi-Ground Truth approach for RGB-D Saliency Detection"
23
+
24
+ st.set_page_config(
25
+ page_title=TITLE,
26
+ page_icon="🧊",
27
+ layout="wide",
28
+ # initial_sidebar_state="expanded",
29
+ # menu_items={
30
+ # 'Get Help': 'https://www.extremelycoolapp.com/help',
31
+ # 'Report a bug': "https://www.extremelycoolapp.com/bug",
32
+ # 'About': "# This is a header. This is an *extremely* cool app!"
33
+ # }
34
+ )
35
+ st.title(TITLE)
36
+
37
+ with st.expander("INTRODUCTION"):
38
+ st.text(
39
+ f"""Demo for S-MultiMAE.
40
+ Device: {device.type}
41
+ Number of CPU(s): {multiprocessing.cpu_count()}"""
42
+ )
43
+ st.image("docs/figures/proposed_method_v5.drawio.png", use_column_width="always")
44
+
45
+ with st.expander("SETTINGS", expanded=True):
46
+ col1, col2 = st.columns(2)
47
+
48
+ with col1:
49
+ mode = st.radio(
50
+ "Mode",
51
+ (
52
+ MODE.IMAGE,
53
+ # MODE.VIDEO,
54
+ # MODE.WEBRTC,
55
+ # MODE.DEMO,
56
+ ),
57
+ )
58
+ st.markdown("---")
59
+ color = color_selection_ui()
60
+
61
+ with col2:
62
+ depth_model = depth_selection_ui()
63
+ st.markdown("---")
64
+ sod_model, da = sod_selection_ui()
65
+
66
+ with st.expander("HOW TO USE", expanded=True):
67
+ st.text(
68
+ "(1) You can change the model type (using different backbones) in the settings."
69
+ )
70
+ st.text("(2) Upload an RGB image.")
71
+ st.text(
72
+ "(3) (Optional) Provide its corresponding depth. If not present, a pseudo-depth will be inferred by a rgb2depth model."
73
+ )
74
+ st.text(
75
+ "(4) You may try a different number of sets of salient objects the model can produce."
76
+ )
77
+ st.text("""(5) Click "Predict Salient Objects".""")
78
+
79
+ if mode == MODE.IMAGE:
80
+ from app_utils.image_inference import image_inference
81
+
82
+ image_inference(depth_model, sod_model, da, color)
83
+ # elif mode == MODE.VIDEO:
84
+ # from video_inference import video_inference
85
+ # video_inference(depth_model, sod_model, color)
86
+ # elif mode == MODE.WEBRTC:
87
+ # from webrtc_app import webrtc_app
88
+ # webrtc_app(depth_model, sod_model, color)
89
+ # elif mode == MODE.DEMO:
90
+ # from demo import demo
91
+ # demo()
streamlit_apps/app_utils/__init__.py ADDED
File without changes
streamlit_apps/app_utils/app_env.py ADDED
@@ -0,0 +1,16 @@
1
+ import os
2
+
3
+ app_env = os.environ.get("APP_ENVIRONMENT", "HUGGINGFACE")
4
+
5
+ IMAGE_SIZE = 224
6
+
7
+
8
+ class DEPTH_MODEL_TYPE:
9
+ DPT_DEPTH = "DPTDepth"
10
+ REL_DEPTH = "RelDepth"
11
+
12
+
13
+ class SOD_MODEL_TYPE:
14
+ S_MULTIMAE = "S-MultiMAE"
15
+ SPNET = "SPNet"
16
+ BBSNET = "BBSNet"
streamlit_apps/app_utils/app_utils.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import random
3
+ import time
4
+ from typing import Tuple, Union
5
+ import cv2
6
+ import numpy as np
7
+ import streamlit as st
8
+ from PIL import Image
9
+ from torch import nn
10
+
11
+ num_format = "{:,}".format
12
+
13
+
14
+ def count_parameters(model: nn.Module) -> str:
15
+ """Count the number of parameters of a model"""
16
+ return num_format(sum(p.numel() for p in model.parameters() if p.requires_grad))
17
+
18
+
19
+ class FrameRate:
20
+ def __init__(self) -> None:
21
+ self.c: int = 0
22
+ self.start_time: float = None
23
+ self.NO_FRAMES = 100
24
+ self.fps: float = -1
25
+
26
+ def reset(self) -> None:
27
+ self.start_time = time.time()
28
+ self.c = 0
29
+ self.fps = -1
30
+
31
+ def count(self) -> None:
32
+ self.c += 1
33
+ if self.c % self.NO_FRAMES == 0:
34
+ self.c = 0
35
+ end_time = time.time()
36
+ self.fps = self.NO_FRAMES / (end_time - self.start_time)
37
+ self.start_time = end_time
38
+
39
+ def show_fps(self, image: np.ndarray) -> np.ndarray:
40
+ if self.fps != -1:
41
+ return cv2.putText(
42
+ image,
43
+ f"FPS {self.fps:.0f}",
44
+ (50, 50),
45
+ cv2.FONT_HERSHEY_SIMPLEX,
46
+ fontScale=1,
47
+ color=(255, 0, 0),
48
+ thickness=2,
49
+ )
50
+ else:
51
+ return image
52
+
53
+
54
+ class ImgContainer:
55
+ img: np.ndarray = None # raw image
56
+ frame_rate: FrameRate = FrameRate()
57
+
58
+
59
+ def load_video(video_path: str) -> bytes:
60
+ if not os.path.isfile(video_path):
61
+ return
62
+ with st.spinner(f"Loading video {video_path} ..."):
63
+ video_bytes = open(video_path, "rb").read()
64
+ st.video(video_bytes, format="video/mp4")
65
+
66
+
67
+ def normalize(data: np.ndarray) -> np.ndarray:
68
+ return (data - data.min()) / (data.max() - data.min() + 1e-8)
69
+
70
+
71
+ def get_size(image: Union[Image.Image, np.ndarray]) -> Tuple[int, int]:
72
+ """Get resolution (w, h) of an image
73
+ An input image can be Pillow Image or CV2 Image
74
+ """
75
+ if type(image) == np.ndarray:
76
+ return (image.shape[1], image.shape[0])
77
+ else:
78
+ return image.size
79
+
80
+
81
+ def random_choice(p: float) -> bool:
82
+ """Return True if random float <= p"""
83
+ return random.random() <= p
streamlit_apps/app_utils/base_model.py ADDED
@@ -0,0 +1,54 @@
1
+ from typing import List
2
+ import numpy as np
3
+ from torch import Tensor, nn
4
+
5
+
6
+ class BaseRGBDModel(nn.Module):
7
+ def __init__(self):
8
+ super(BaseRGBDModel, self).__init__()
9
+ """
10
+ Requirements:
11
+ 1. Construct a model
12
+ 2. Load pretrained weights
13
+ 3. Load model into device
14
+ 4. Construct preprocessing
15
+ """
16
+
17
+ def inference(
18
+ self,
19
+ image: Tensor,
20
+ depth: Tensor,
21
+ origin_shape: np.array,
22
+ ) -> List[np.ndarray]:
23
+ """
24
+ Given:
25
+ - An image (Tensor) with original shape [c, h, w]
26
+ - A depth image (Tensor) with a shape of [c, h, w]; it does not need to be the same shape as the image
27
+
28
+ Requirements:
29
+ 1. Preprocessing
30
+ 2. Inference
31
+ 3. Return saliency maps np.float32 between 0.0 and 1.0,
32
+ with the same size as original size
33
+
34
+ """
35
+ raise NotImplementedError()
36
+
37
+ def batch_inference(
38
+ self,
39
+ images: Tensor,
40
+ depths: Tensor,
41
+ ) -> List[np.ndarray]:
42
+ """
43
+ Given:
44
+ - A batch of images (Tensor) with original shape [b, c, h, w]
45
+ - A batch of depths (Tensor) with a shape of [b, c, h, w]; they do not need to be the same shape as the images
46
+
47
+ Requirements:
48
+ 1. Preprocessing
49
+ 2. Inference
50
+ 3. Return saliency maps np.float32 between 0.0 and 1.0,
51
+ with the same size as original size
52
+
53
+ """
54
+ raise NotImplementedError()
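A skeleton of what a concrete subclass of BaseRGBDModel is expected to provide. The dummy implementation below only illustrates the expected return types; the real models in this repository wrap actual networks and preprocessing:

    import numpy as np
    from torch import Tensor

    class DummyRGBDModel(BaseRGBDModel):
        """Illustrative only: returns one all-zero saliency map per sample."""

        def inference(self, image: Tensor, depth: Tensor, origin_shape: np.ndarray):
            h, w = int(origin_shape[0]), int(origin_shape[1])  # assumed (h, w) ordering
            return [np.zeros((h, w), dtype=np.float32)]

        def batch_inference(self, images: Tensor, depths: Tensor):
            h, w = images.shape[-2:]
            return [np.zeros((h, w), dtype=np.float32) for _ in range(images.shape[0])]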
streamlit_apps/app_utils/color_selection_ui.py ADDED
@@ -0,0 +1,10 @@
1
+ import numpy as np
2
+ import streamlit as st
3
+
4
+ from s_multimae.utils import hex_to_rgb
5
+
6
+
7
+ def color_selection_ui() -> np.ndarray:
8
+ color = st.color_picker("Pick A Color", value="#00f900", key="color")
9
+ color = hex_to_rgb(color)
10
+ return color
streamlit_apps/app_utils/depth_model.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+ import torch
3
+ import torchvision.transforms as transforms
4
+ import torchvision.transforms.functional as TF
5
+ from torch import Tensor, nn
6
+
7
+ from .app_utils import count_parameters
8
+ from .device import device
9
+ from .dpt.models import DPTDepthModel
10
+
11
+
12
+ class BaseDepthModel:
13
+ def __init__(self, image_size: int) -> None:
14
+ self.image_size = image_size
15
+ self.model: nn.Module = None
16
+
17
+ def forward(self, image: Tensor) -> Tensor:
18
+ """Perform forward inference for an image
19
+ Input image of shape [c, h, w]
20
+ Return of shape [c, h, w]
21
+ """
22
+ raise NotImplementedError()
23
+
24
+ def batch_forward(self, images: Tensor) -> Tensor:
25
+ """Perform forward inference for a batch of images
26
+ Input images of shape [b, c, h, w]
27
+ Return of shape [b, c, h, w]"""
28
+ raise NotImplementedError()
29
+
30
+ def get_number_of_parameters(self) -> int:
31
+ return count_parameters(self.model)
32
+
33
+
34
+ class DPTDepth(BaseDepthModel):
35
+ def __init__(self, image_size: int) -> None:
36
+ super().__init__(image_size)
37
+ print("DPTDepthconstructor")
38
+ weights_fname = "omnidata_rgb2depth_dpt_hybrid.pth"
39
+ weights_path = os.path.join("weights", weights_fname)
40
+ if not os.path.isfile(weights_path):
41
+ from huggingface_hub import hf_hub_download
42
+ weights_path = hf_hub_download(repo_id="RGBD-SOD/S-MultiMAE", filename=weights_fname)
43
+ os.system(f"mv {weights_fname} weights")
44
+ omnidata_ckpt = torch.load(
45
+ weights_path,
46
+ map_location="cpu",
47
+ )
48
+
49
+ self.model = DPTDepthModel()
50
+ self.model.load_state_dict(omnidata_ckpt)
51
+ self.model: DPTDepthModel = self.model.to(device).eval()
52
+
53
+ self.transform = transforms.Compose(
54
+ [
55
+ transforms.Resize(
56
+ (self.image_size, self.image_size),
57
+ interpolation=TF.InterpolationMode.BICUBIC,
58
+ ),
59
+ transforms.Normalize(
60
+ (0.5, 0.5, 0.5),
61
+ (0.5, 0.5, 0.5),
62
+ ),
63
+ ]
64
+ )
65
+
66
+ def forward(self, image: Tensor) -> Tensor:
67
+ depth_model_input = self.transform(image.unsqueeze(0))
68
+ return self.model.forward(depth_model_input.to(device)).squeeze(0)
69
+
70
+ def batch_forward(self, images: Tensor) -> Tensor:
71
+ images: Tensor = TF.resize(
72
+ images,
73
+ (self.image_size, self.image_size),
74
+ interpolation=TF.InterpolationMode.BICUBIC,
75
+ )
76
+ depth_model_input = (images - 0.5) / 0.5
77
+ return self.model(depth_model_input.to(device))
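A minimal sketch of using DPTDepth above to obtain a pseudo-depth map for a single RGB image; the image path is hypothetical and the weights are fetched from the Hub on first use:

    import torch
    import torchvision.transforms.functional as TF
    from PIL import Image

    depth_model = DPTDepth(image_size=224)
    rgb = TF.to_tensor(Image.open("example_rgb.png").convert("RGB"))  # [3, h, w], values in [0, 1]
    with torch.no_grad():
        depth = depth_model.forward(rgb)  # expected roughly [1, 224, 224] for image_size=224
    print(depth.shape)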
streamlit_apps/app_utils/depth_selection_ui.py ADDED
@@ -0,0 +1,27 @@
1
+ import streamlit as st
2
+
3
+ from .app_env import DEPTH_MODEL_TYPE, IMAGE_SIZE
4
+ from .depth_model import BaseDepthModel, DPTDepth
5
+
6
+
7
+ @st.cache_resource
8
+ def load_depth_model(depth_model_type: DEPTH_MODEL_TYPE) -> DPTDepth:
9
+ if depth_model_type == DEPTH_MODEL_TYPE.DPT_DEPTH:
10
+ return DPTDepth(IMAGE_SIZE)
11
+ else:
12
+ return DPTDepth(IMAGE_SIZE)
13
+
14
+
15
+ def depth_selection_ui() -> BaseDepthModel:
16
+ depth_model: BaseDepthModel = None
17
+ depth_model_type = st.selectbox(
18
+ "Choose depth model",
19
+ (
20
+ DEPTH_MODEL_TYPE.DPT_DEPTH,
21
+ # DEPTH_MODEL_TYPE.REL_DEPTH,
22
+ ),
23
+ key="depth_model_type",
24
+ )
25
+ depth_model = load_depth_model(depth_model_type)
26
+ st.text(f"Number of parameters {depth_model.get_number_of_parameters()}")
27
+ return depth_model
streamlit_apps/app_utils/device.py ADDED
@@ -0,0 +1,5 @@
1
+ import torch
2
+
3
+ cpu_device = torch.device("cpu")
4
+ # device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
5
+ device = cpu_device
streamlit_apps/app_utils/dpt/__init__.py ADDED
File without changes
streamlit_apps/app_utils/dpt/base_model.py ADDED
@@ -0,0 +1,16 @@
1
+ import torch
2
+
3
+
4
+ class BaseModel(torch.nn.Module):
5
+ def load(self, path):
6
+ """Load model from file.
7
+
8
+ Args:
9
+ path (str): file path
10
+ """
11
+ parameters = torch.load(path, map_location=torch.device("cpu"))
12
+
13
+ if "optimizer" in parameters:
14
+ parameters = parameters["model"]
15
+
16
+ self.load_state_dict(parameters)
streamlit_apps/app_utils/dpt/blocks.py ADDED
@@ -0,0 +1,383 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .vit import (
5
+ _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384,
7
+ _make_pretrained_vitb16_384,
8
+ forward_vit,
9
+ )
10
+
11
+
12
+ def _make_encoder(
13
+ backbone,
14
+ features,
15
+ use_pretrained,
16
+ groups=1,
17
+ expand=False,
18
+ exportable=True,
19
+ hooks=None,
20
+ use_vit_only=False,
21
+ use_readout="ignore",
22
+ enable_attention_hooks=False,
23
+ ):
24
+ if backbone == "vitl16_384":
25
+ pretrained = _make_pretrained_vitl16_384(
26
+ use_pretrained,
27
+ hooks=hooks,
28
+ use_readout=use_readout,
29
+ enable_attention_hooks=enable_attention_hooks,
30
+ )
31
+ scratch = _make_scratch(
32
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
33
+ ) # ViT-L/16 - 85.0% Top1 (backbone)
34
+ elif backbone == "vitb_rn50_384":
35
+ pretrained = _make_pretrained_vitb_rn50_384(
36
+ use_pretrained,
37
+ hooks=hooks,
38
+ use_vit_only=use_vit_only,
39
+ use_readout=use_readout,
40
+ enable_attention_hooks=enable_attention_hooks,
41
+ )
42
+ scratch = _make_scratch(
43
+ [256, 512, 768, 768], features, groups=groups, expand=expand
44
+ ) # ViT-H/16 - 85.0% Top1 (backbone)
45
+ elif backbone == "vitb16_384":
46
+ pretrained = _make_pretrained_vitb16_384(
47
+ use_pretrained,
48
+ hooks=hooks,
49
+ use_readout=use_readout,
50
+ enable_attention_hooks=enable_attention_hooks,
51
+ )
52
+ scratch = _make_scratch(
53
+ [96, 192, 384, 768], features, groups=groups, expand=expand
54
+ ) # ViT-B/16 - 84.6% Top1 (backbone)
55
+ elif backbone == "resnext101_wsl":
56
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
57
+ scratch = _make_scratch(
58
+ [256, 512, 1024, 2048], features, groups=groups, expand=expand
59
+ ) # efficientnet_lite3
60
+ else:
61
+ print(f"Backbone '{backbone}' not implemented")
62
+ assert False
63
+
64
+ return pretrained, scratch
65
+
66
+
67
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
68
+ scratch = nn.Module()
69
+
70
+ out_shape1 = out_shape
71
+ out_shape2 = out_shape
72
+ out_shape3 = out_shape
73
+ out_shape4 = out_shape
74
+ if expand == True:
75
+ out_shape1 = out_shape
76
+ out_shape2 = out_shape * 2
77
+ out_shape3 = out_shape * 4
78
+ out_shape4 = out_shape * 8
79
+
80
+ scratch.layer1_rn = nn.Conv2d(
81
+ in_shape[0],
82
+ out_shape1,
83
+ kernel_size=3,
84
+ stride=1,
85
+ padding=1,
86
+ bias=False,
87
+ groups=groups,
88
+ )
89
+ scratch.layer2_rn = nn.Conv2d(
90
+ in_shape[1],
91
+ out_shape2,
92
+ kernel_size=3,
93
+ stride=1,
94
+ padding=1,
95
+ bias=False,
96
+ groups=groups,
97
+ )
98
+ scratch.layer3_rn = nn.Conv2d(
99
+ in_shape[2],
100
+ out_shape3,
101
+ kernel_size=3,
102
+ stride=1,
103
+ padding=1,
104
+ bias=False,
105
+ groups=groups,
106
+ )
107
+ scratch.layer4_rn = nn.Conv2d(
108
+ in_shape[3],
109
+ out_shape4,
110
+ kernel_size=3,
111
+ stride=1,
112
+ padding=1,
113
+ bias=False,
114
+ groups=groups,
115
+ )
116
+
117
+ return scratch
118
+
119
+
120
+ def _make_resnet_backbone(resnet):
121
+ pretrained = nn.Module()
122
+ pretrained.layer1 = nn.Sequential(
123
+ resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
124
+ )
125
+
126
+ pretrained.layer2 = resnet.layer2
127
+ pretrained.layer3 = resnet.layer3
128
+ pretrained.layer4 = resnet.layer4
129
+
130
+ return pretrained
131
+
132
+
133
+ def _make_pretrained_resnext101_wsl(use_pretrained):
134
+ resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
135
+ return _make_resnet_backbone(resnet)
136
+
137
+
138
+ class Interpolate(nn.Module):
139
+ """Interpolation module."""
140
+
141
+ def __init__(self, scale_factor, mode, align_corners=False):
142
+ """Init.
143
+
144
+ Args:
145
+ scale_factor (float): scaling
146
+ mode (str): interpolation mode
147
+ """
148
+ super(Interpolate, self).__init__()
149
+
150
+ self.interp = nn.functional.interpolate
151
+ self.scale_factor = scale_factor
152
+ self.mode = mode
153
+ self.align_corners = align_corners
154
+
155
+ def forward(self, x):
156
+ """Forward pass.
157
+
158
+ Args:
159
+ x (tensor): input
160
+
161
+ Returns:
162
+ tensor: interpolated data
163
+ """
164
+
165
+ x = self.interp(
166
+ x,
167
+ scale_factor=self.scale_factor,
168
+ mode=self.mode,
169
+ align_corners=self.align_corners,
170
+ )
171
+
172
+ return x
173
+
174
+
175
+ class ResidualConvUnit(nn.Module):
176
+ """Residual convolution module."""
177
+
178
+ def __init__(self, features):
179
+ """Init.
180
+
181
+ Args:
182
+ features (int): number of features
183
+ """
184
+ super().__init__()
185
+
186
+ self.conv1 = nn.Conv2d(
187
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
188
+ )
189
+
190
+ self.conv2 = nn.Conv2d(
191
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
192
+ )
193
+
194
+ self.relu = nn.ReLU(inplace=True)
195
+
196
+ def forward(self, x):
197
+ """Forward pass.
198
+
199
+ Args:
200
+ x (tensor): input
201
+
202
+ Returns:
203
+ tensor: output
204
+ """
205
+ out = self.relu(x)
206
+ out = self.conv1(out)
207
+ out = self.relu(out)
208
+ out = self.conv2(out)
209
+
210
+ return out + x
211
+
212
+
213
+ class FeatureFusionBlock(nn.Module):
214
+ """Feature fusion block."""
215
+
216
+ def __init__(self, features):
217
+ """Init.
218
+
219
+ Args:
220
+ features (int): number of features
221
+ """
222
+ super(FeatureFusionBlock, self).__init__()
223
+
224
+ self.resConfUnit1 = ResidualConvUnit(features)
225
+ self.resConfUnit2 = ResidualConvUnit(features)
226
+
227
+ def forward(self, *xs):
228
+ """Forward pass.
229
+
230
+ Returns:
231
+ tensor: output
232
+ """
233
+ output = xs[0]
234
+
235
+ if len(xs) == 2:
236
+ output += self.resConfUnit1(xs[1])
237
+
238
+ output = self.resConfUnit2(output)
239
+
240
+ output = nn.functional.interpolate(
241
+ output, scale_factor=2, mode="bilinear", align_corners=True
242
+ )
243
+
244
+ return output
245
+
246
+
247
+ class ResidualConvUnit_custom(nn.Module):
248
+ """Residual convolution module."""
249
+
250
+ def __init__(self, features, activation, bn):
251
+ """Init.
252
+
253
+ Args:
254
+ features (int): number of features
255
+ """
256
+ super().__init__()
257
+
258
+ self.bn = bn
259
+
260
+ self.groups = 1
261
+
262
+ self.conv1 = nn.Conv2d(
263
+ features,
264
+ features,
265
+ kernel_size=3,
266
+ stride=1,
267
+ padding=1,
268
+ bias=not self.bn,
269
+ groups=self.groups,
270
+ )
271
+
272
+ self.conv2 = nn.Conv2d(
273
+ features,
274
+ features,
275
+ kernel_size=3,
276
+ stride=1,
277
+ padding=1,
278
+ bias=not self.bn,
279
+ groups=self.groups,
280
+ )
281
+
282
+ if self.bn == True:
283
+ self.bn1 = nn.BatchNorm2d(features)
284
+ self.bn2 = nn.BatchNorm2d(features)
285
+
286
+ self.activation = activation
287
+
288
+ self.skip_add = nn.quantized.FloatFunctional()
289
+
290
+ def forward(self, x):
291
+ """Forward pass.
292
+
293
+ Args:
294
+ x (tensor): input
295
+
296
+ Returns:
297
+ tensor: output
298
+ """
299
+
300
+ out = self.activation(x)
301
+ out = self.conv1(out)
302
+ if self.bn == True:
303
+ out = self.bn1(out)
304
+
305
+ out = self.activation(out)
306
+ out = self.conv2(out)
307
+ if self.bn == True:
308
+ out = self.bn2(out)
309
+
310
+ if self.groups > 1:
311
+ out = self.conv_merge(out)
312
+
313
+ return self.skip_add.add(out, x)
314
+
315
+ # return out + x
316
+
317
+
318
+ class FeatureFusionBlock_custom(nn.Module):
319
+ """Feature fusion block."""
320
+
321
+ def __init__(
322
+ self,
323
+ features,
324
+ activation,
325
+ deconv=False,
326
+ bn=False,
327
+ expand=False,
328
+ align_corners=True,
329
+ ):
330
+ """Init.
331
+
332
+ Args:
333
+ features (int): number of features
334
+ """
335
+ super(FeatureFusionBlock_custom, self).__init__()
336
+
337
+ self.deconv = deconv
338
+ self.align_corners = align_corners
339
+
340
+ self.groups = 1
341
+
342
+ self.expand = expand
343
+ out_features = features
344
+ if self.expand == True:
345
+ out_features = features // 2
346
+
347
+ self.out_conv = nn.Conv2d(
348
+ features,
349
+ out_features,
350
+ kernel_size=1,
351
+ stride=1,
352
+ padding=0,
353
+ bias=True,
354
+ groups=1,
355
+ )
356
+
357
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
358
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
359
+
360
+ self.skip_add = nn.quantized.FloatFunctional()
361
+
362
+ def forward(self, *xs):
363
+ """Forward pass.
364
+
365
+ Returns:
366
+ tensor: output
367
+ """
368
+ output = xs[0]
369
+
370
+ if len(xs) == 2:
371
+ res = self.resConfUnit1(xs[1])
372
+ output = self.skip_add.add(output, res)
373
+ # output += res
374
+
375
+ output = self.resConfUnit2(output)
376
+
377
+ output = nn.functional.interpolate(
378
+ output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
379
+ )
380
+
381
+ output = self.out_conv(output)
382
+
383
+ return output
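A small sanity-check sketch of the custom fusion block above: it adds an optional lateral branch, refines the sum with two residual units, upsamples by a factor of two, and projects with a 1x1 convolution. Shapes are illustrative:

    import torch
    from torch import nn

    block = FeatureFusionBlock_custom(features=256, activation=nn.ReLU(False), bn=False)
    coarse = torch.randn(1, 256, 14, 14)   # deeper feature map
    lateral = torch.randn(1, 256, 14, 14)  # skip feature map at the same resolution
    with torch.no_grad():
        fused = block(coarse, lateral)
    print(fused.shape)  # torch.Size([1, 256, 28, 28]) - spatial size doubled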
streamlit_apps/app_utils/dpt/midas_net.py ADDED
@@ -0,0 +1,78 @@
1
+ """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .base_model import BaseModel
10
+ from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
11
+
12
+
13
+ class MidasNet_large(BaseModel):
14
+ """Network for monocular depth estimation."""
15
+
16
+ def __init__(self, path=None, features=256, non_negative=True):
17
+ """Init.
18
+
19
+ Args:
20
+ path (str, optional): Path to saved model. Defaults to None.
21
+ features (int, optional): Number of features. Defaults to 256.
22
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
23
+ """
24
+ print("Loading weights: ", path)
25
+
26
+ super(MidasNet_large, self).__init__()
27
+
28
+ use_pretrained = False if path is None else True
29
+
30
+ self.pretrained, self.scratch = _make_encoder(
31
+ backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained
32
+ )
33
+
34
+ self.scratch.refinenet4 = FeatureFusionBlock(features)
35
+ self.scratch.refinenet3 = FeatureFusionBlock(features)
36
+ self.scratch.refinenet2 = FeatureFusionBlock(features)
37
+ self.scratch.refinenet1 = FeatureFusionBlock(features)
38
+
39
+ self.scratch.output_conv = nn.Sequential(
40
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
41
+ Interpolate(scale_factor=2, mode="bilinear"),
42
+ nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
43
+ nn.ReLU(True),
44
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
45
+ nn.ReLU(True) if non_negative else nn.Identity(),
46
+ )
47
+
48
+ if path:
49
+ self.load(path)
50
+
51
+ def forward(self, x):
52
+ """Forward pass.
53
+
54
+ Args:
55
+ x (tensor): input data (image)
56
+
57
+ Returns:
58
+ tensor: depth
59
+ """
60
+
61
+ layer_1 = self.pretrained.layer1(x)
62
+ layer_2 = self.pretrained.layer2(layer_1)
63
+ layer_3 = self.pretrained.layer3(layer_2)
64
+ layer_4 = self.pretrained.layer4(layer_3)
65
+
66
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
67
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
68
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
69
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
70
+
71
+ path_4 = self.scratch.refinenet4(layer_4_rn)
72
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
73
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
74
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
75
+
76
+ out = self.scratch.output_conv(path_1)
77
+
78
+ return torch.squeeze(out, dim=1)
streamlit_apps/app_utils/dpt/models.py ADDED
@@ -0,0 +1,124 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch import Tensor
4
+
5
+ from .base_model import BaseModel
6
+ from .blocks import (
7
+ FeatureFusionBlock_custom,
8
+ Interpolate,
9
+ _make_encoder,
10
+ forward_vit,
11
+ )
12
+
13
+
14
+ def _make_fusion_block(features, use_bn):
15
+ return FeatureFusionBlock_custom(
16
+ features,
17
+ nn.ReLU(False),
18
+ deconv=False,
19
+ bn=use_bn,
20
+ expand=False,
21
+ align_corners=True,
22
+ )
23
+
24
+
25
+ class DPT(BaseModel):
26
+ def __init__(
27
+ self,
28
+ head,
29
+ features=256,
30
+ backbone="vitb_rn50_384",
31
+ readout="project",
32
+ channels_last=False,
33
+ use_bn=False,
34
+ enable_attention_hooks=False,
35
+ ):
36
+ super(DPT, self).__init__()
37
+
38
+ self.channels_last = channels_last
39
+
40
+ hooks = {
41
+ "vitb_rn50_384": [0, 1, 8, 11],
42
+ "vitb16_384": [2, 5, 8, 11],
43
+ "vitl16_384": [5, 11, 17, 23],
44
+ }
45
+
46
+ # Instantiate backbone and reassemble blocks
47
+ self.pretrained, self.scratch = _make_encoder(
48
+ backbone,
49
+ features,
50
+ False, # Set to True if you want to train from scratch (uses ImageNet weights)
51
+ groups=1,
52
+ expand=False,
53
+ exportable=False,
54
+ hooks=hooks[backbone],
55
+ use_readout=readout,
56
+ enable_attention_hooks=enable_attention_hooks,
57
+ )
58
+
59
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
60
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
61
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
62
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
63
+
64
+ self.scratch.output_conv = head
65
+
66
+ def forward(self, x: Tensor) -> Tensor:
67
+ if self.channels_last == True:
68
+ x.contiguous(memory_format=torch.channels_last)
69
+
70
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
71
+
72
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
73
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
74
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
75
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
76
+
77
+ path_4 = self.scratch.refinenet4(layer_4_rn)
78
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
79
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
80
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
81
+
82
+ out = self.scratch.output_conv(path_1)
83
+
84
+ return out
85
+
86
+
87
+ class DPTDepthModel(DPT):
88
+ def __init__(
89
+ self, path=None, non_negative=True, scale=1.0, shift=0.0, invert=False, **kwargs
90
+ ):
91
+ features = kwargs["features"] if "features" in kwargs else 256
92
+
93
+ self.scale = scale
94
+ self.shift = shift
95
+ self.invert = invert
96
+
97
+ head = nn.Sequential(
98
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
99
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
100
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
101
+ nn.ReLU(True),
102
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
103
+ nn.ReLU(True) if non_negative else nn.Identity(),
104
+ nn.Identity(),
105
+ )
106
+
107
+ super().__init__(head, **kwargs)
108
+
109
+ if path is not None:
110
+ self.load(path)
111
+
112
+ def forward(self, x: Tensor) -> Tensor:
113
+ """Input x of shape [b, c, h, w]
114
+ Return tensor of shape [b, c, h, w]
115
+ """
116
+ inv_depth = super().forward(x)
117
+
118
+ if self.invert:
119
+ depth = self.scale * inv_depth + self.shift
120
+ depth[depth < 1e-8] = 1e-8
121
+ depth = 1.0 / depth
122
+ return depth
123
+ else:
124
+ return inv_depth
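A minimal sketch of running the depth model above on a random, already-normalized input without pretrained weights (pass path= to load a checkpoint, as DPTDepth does). It assumes timm is available for the ViT backbone; shapes are illustrative:

    import torch

    model = DPTDepthModel(backbone="vitb_rn50_384", non_negative=True).eval()
    x = torch.randn(1, 3, 224, 224)  # batch of normalized RGB images
    with torch.no_grad():
        depth = model(x)
    print(depth.shape)  # one depth channel per image, upsampled by the output head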
streamlit_apps/app_utils/dpt/transforms.py ADDED
@@ -0,0 +1,231 @@
1
+ import numpy as np
2
+ import cv2
3
+ import math
4
+
5
+
6
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
7
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
8
+
9
+ Args:
10
+ sample (dict): sample
11
+ size (tuple): image size
12
+
13
+ Returns:
14
+ tuple: new size
15
+ """
16
+ shape = list(sample["disparity"].shape)
17
+
18
+ if shape[0] >= size[0] and shape[1] >= size[1]:
19
+ return sample
20
+
21
+ scale = [0, 0]
22
+ scale[0] = size[0] / shape[0]
23
+ scale[1] = size[1] / shape[1]
24
+
25
+ scale = max(scale)
26
+
27
+ shape[0] = math.ceil(scale * shape[0])
28
+ shape[1] = math.ceil(scale * shape[1])
29
+
30
+ # resize
31
+ sample["image"] = cv2.resize(
32
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
33
+ )
34
+
35
+ sample["disparity"] = cv2.resize(
36
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
37
+ )
38
+ sample["mask"] = cv2.resize(
39
+ sample["mask"].astype(np.float32),
40
+ tuple(shape[::-1]),
41
+ interpolation=cv2.INTER_NEAREST,
42
+ )
43
+ sample["mask"] = sample["mask"].astype(bool)
44
+
45
+ return tuple(shape)
46
+
47
+
48
+ class Resize(object):
49
+ """Resize sample to given size (width, height)."""
50
+
51
+ def __init__(
52
+ self,
53
+ width,
54
+ height,
55
+ resize_target=True,
56
+ keep_aspect_ratio=False,
57
+ ensure_multiple_of=1,
58
+ resize_method="lower_bound",
59
+ image_interpolation_method=cv2.INTER_AREA,
60
+ ):
61
+ """Init.
62
+
63
+ Args:
64
+ width (int): desired output width
65
+ height (int): desired output height
66
+ resize_target (bool, optional):
67
+ True: Resize the full sample (image, mask, target).
68
+ False: Resize image only.
69
+ Defaults to True.
70
+ keep_aspect_ratio (bool, optional):
71
+ True: Keep the aspect ratio of the input sample.
72
+ Output sample might not have the given width and height, and
73
+ resize behaviour depends on the parameter 'resize_method'.
74
+ Defaults to False.
75
+ ensure_multiple_of (int, optional):
76
+ Output width and height is constrained to be multiple of this parameter.
77
+ Defaults to 1.
78
+ resize_method (str, optional):
79
+ "lower_bound": Output will be at least as large as the given size.
80
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
81
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
82
+ Defaults to "lower_bound".
83
+ """
84
+ self.__width = width
85
+ self.__height = height
86
+
87
+ self.__resize_target = resize_target
88
+ self.__keep_aspect_ratio = keep_aspect_ratio
89
+ self.__multiple_of = ensure_multiple_of
90
+ self.__resize_method = resize_method
91
+ self.__image_interpolation_method = image_interpolation_method
92
+
93
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
94
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
95
+
96
+ if max_val is not None and y > max_val:
97
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
98
+
99
+ if y < min_val:
100
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
101
+
102
+ return y
103
+
104
+ def get_size(self, width, height):
105
+ # determine new height and width
106
+ scale_height = self.__height / height
107
+ scale_width = self.__width / width
108
+
109
+ if self.__keep_aspect_ratio:
110
+ if self.__resize_method == "lower_bound":
111
+ # scale such that output size is lower bound
112
+ if scale_width > scale_height:
113
+ # fit width
114
+ scale_height = scale_width
115
+ else:
116
+ # fit height
117
+ scale_width = scale_height
118
+ elif self.__resize_method == "upper_bound":
119
+ # scale such that output size is upper bound
120
+ if scale_width < scale_height:
121
+ # fit width
122
+ scale_height = scale_width
123
+ else:
124
+ # fit height
125
+ scale_width = scale_height
126
+ elif self.__resize_method == "minimal":
127
+ # scale as little as possible
128
+ if abs(1 - scale_width) < abs(1 - scale_height):
129
+ # fit width
130
+ scale_height = scale_width
131
+ else:
132
+ # fit height
133
+ scale_width = scale_height
134
+ else:
135
+ raise ValueError(
136
+ f"resize_method {self.__resize_method} not implemented"
137
+ )
138
+
139
+ if self.__resize_method == "lower_bound":
140
+ new_height = self.constrain_to_multiple_of(
141
+ scale_height * height, min_val=self.__height
142
+ )
143
+ new_width = self.constrain_to_multiple_of(
144
+ scale_width * width, min_val=self.__width
145
+ )
146
+ elif self.__resize_method == "upper_bound":
147
+ new_height = self.constrain_to_multiple_of(
148
+ scale_height * height, max_val=self.__height
149
+ )
150
+ new_width = self.constrain_to_multiple_of(
151
+ scale_width * width, max_val=self.__width
152
+ )
153
+ elif self.__resize_method == "minimal":
154
+ new_height = self.constrain_to_multiple_of(scale_height * height)
155
+ new_width = self.constrain_to_multiple_of(scale_width * width)
156
+ else:
157
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
158
+
159
+ return (new_width, new_height)
160
+
161
+ def __call__(self, sample):
162
+ width, height = self.get_size(
163
+ sample["image"].shape[1], sample["image"].shape[0]
164
+ )
165
+
166
+ # resize sample
167
+ sample["image"] = cv2.resize(
168
+ sample["image"],
169
+ (width, height),
170
+ interpolation=self.__image_interpolation_method,
171
+ )
172
+
173
+ if self.__resize_target:
174
+ if "disparity" in sample:
175
+ sample["disparity"] = cv2.resize(
176
+ sample["disparity"],
177
+ (width, height),
178
+ interpolation=cv2.INTER_NEAREST,
179
+ )
180
+
181
+ if "depth" in sample:
182
+ sample["depth"] = cv2.resize(
183
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
184
+ )
185
+
186
+ sample["mask"] = cv2.resize(
187
+ sample["mask"].astype(np.float32),
188
+ (width, height),
189
+ interpolation=cv2.INTER_NEAREST,
190
+ )
191
+ sample["mask"] = sample["mask"].astype(bool)
192
+
193
+ return sample
194
+
195
+
196
+ class NormalizeImage(object):
197
+ """Normlize image by given mean and std."""
198
+
199
+ def __init__(self, mean, std):
200
+ self.__mean = mean
201
+ self.__std = std
202
+
203
+ def __call__(self, sample):
204
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
205
+
206
+ return sample
207
+
208
+
209
+ class PrepareForNet(object):
210
+ """Prepare sample for usage as network input."""
211
+
212
+ def __init__(self):
213
+ pass
214
+
215
+ def __call__(self, sample):
216
+ image = np.transpose(sample["image"], (2, 0, 1))
217
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
218
+
219
+ if "mask" in sample:
220
+ sample["mask"] = sample["mask"].astype(np.float32)
221
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
222
+
223
+ if "disparity" in sample:
224
+ disparity = sample["disparity"].astype(np.float32)
225
+ sample["disparity"] = np.ascontiguousarray(disparity)
226
+
227
+ if "depth" in sample:
228
+ depth = sample["depth"].astype(np.float32)
229
+ sample["depth"] = np.ascontiguousarray(depth)
230
+
231
+ return sample
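A minimal sketch of composing the transforms above into a typical DPT/MiDaS-style preprocessing pipeline; the input file name is hypothetical:

    import cv2
    from torchvision.transforms import Compose

    transform = Compose(
        [
            Resize(
                384,
                384,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=32,
                resize_method="minimal",
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            PrepareForNet(),
        ]
    )

    img = cv2.cvtColor(cv2.imread("example_rgb.png"), cv2.COLOR_BGR2RGB) / 255.0
    sample = transform({"image": img})
    print(sample["image"].shape)  # (3, H, W) float32, ready to be batched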
streamlit_apps/app_utils/dpt/vit.py ADDED
@@ -0,0 +1,576 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+
8
+
9
+ activations = {}
10
+
11
+
12
+ def get_activation(name):
13
+ def hook(model, input, output):
14
+ activations[name] = output
15
+
16
+ return hook
17
+
18
+
19
+ attention = {}
20
+
21
+
22
+ def get_attention(name):
23
+ def hook(module, input, output):
24
+ x = input[0]
25
+ B, N, C = x.shape
26
+ qkv = (
27
+ module.qkv(x)
28
+ .reshape(B, N, 3, module.num_heads, C // module.num_heads)
29
+ .permute(2, 0, 3, 1, 4)
30
+ )
31
+ q, k, v = (
32
+ qkv[0],
33
+ qkv[1],
34
+ qkv[2],
35
+ ) # make torchscript happy (cannot use tensor as tuple)
36
+
37
+ attn = (q @ k.transpose(-2, -1)) * module.scale
38
+
39
+ attn = attn.softmax(dim=-1) # [:,:,1,1:]
40
+ attention[name] = attn
41
+
42
+ return hook
43
+
44
+
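+ # Takes the attention paid by `token` to the patch tokens, reshapes it to the patch grid,
+ # upsamples it to the input resolution, and averages it over the attention heads.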
45
+ def get_mean_attention_map(attn, token, shape):
46
+ attn = attn[:, :, token, 1:]
47
+ attn = attn.unflatten(2, torch.Size([shape[2] // 16, shape[3] // 16])).float()
48
+ attn = torch.nn.functional.interpolate(
49
+ attn, size=shape[2:], mode="bicubic", align_corners=False
50
+ ).squeeze(0)
51
+
52
+ all_attn = torch.mean(attn, 0)
53
+
54
+ return all_attn
55
+
56
+
57
+ class Slice(nn.Module):
58
+ def __init__(self, start_index=1):
59
+ super(Slice, self).__init__()
60
+ self.start_index = start_index
61
+
62
+ def forward(self, x):
63
+ return x[:, self.start_index :]
64
+
65
+
66
+ class AddReadout(nn.Module):
67
+ def __init__(self, start_index=1):
68
+ super(AddReadout, self).__init__()
69
+ self.start_index = start_index
70
+
71
+ def forward(self, x):
72
+ if self.start_index == 2:
73
+ readout = (x[:, 0] + x[:, 1]) / 2
74
+ else:
75
+ readout = x[:, 0]
76
+ return x[:, self.start_index :] + readout.unsqueeze(1)
77
+
78
+
79
+ class ProjectReadout(nn.Module):
80
+ def __init__(self, in_features, start_index=1):
81
+ super(ProjectReadout, self).__init__()
82
+ self.start_index = start_index
83
+
84
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
85
+
86
+ def forward(self, x):
87
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
88
+ features = torch.cat((x[:, self.start_index :], readout), -1)
89
+
90
+ return self.project(features)
91
+
92
+
93
+ class Transpose(nn.Module):
94
+ def __init__(self, dim0, dim1):
95
+ super(Transpose, self).__init__()
96
+ self.dim0 = dim0
97
+ self.dim1 = dim1
98
+
99
+ def forward(self, x):
100
+ x = x.transpose(self.dim0, self.dim1)
101
+ return x
102
+
103
+
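+ # Runs the patched forward pass and converts the four hooked block activations into
+ # spatial feature maps via the act_postprocess heads attached by the backbone factories below.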
104
+ def forward_vit(pretrained, x):
105
+ b, c, h, w = x.shape
106
+
107
+ glob = pretrained.model.forward_flex(x)
108
+
109
+ layer_1 = pretrained.activations["1"]
110
+ layer_2 = pretrained.activations["2"]
111
+ layer_3 = pretrained.activations["3"]
112
+ layer_4 = pretrained.activations["4"]
113
+
114
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
115
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
116
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
117
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
118
+
119
+ unflatten = nn.Sequential(
120
+ nn.Unflatten(
121
+ 2,
122
+ torch.Size(
123
+ [
124
+ h // pretrained.model.patch_size[1],
125
+ w // pretrained.model.patch_size[0],
126
+ ]
127
+ ),
128
+ )
129
+ )
130
+
131
+ if layer_1.ndim == 3:
132
+ layer_1 = unflatten(layer_1)
133
+ if layer_2.ndim == 3:
134
+ layer_2 = unflatten(layer_2)
135
+ if layer_3.ndim == 3:
136
+ layer_3 = unflatten(layer_3)
137
+ if layer_4.ndim == 3:
138
+ layer_4 = unflatten(layer_4)
139
+
140
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
141
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
142
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
143
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
144
+
145
+ return layer_1, layer_2, layer_3, layer_4
146
+
147
+
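+ # Bilinearly interpolates the learned positional-embedding grid to gs_h x gs_w patches while
+ # keeping the class/readout token embeddings, so inputs need not match the pretraining resolution.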
148
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
149
+ posemb_tok, posemb_grid = (
150
+ posemb[:, : self.start_index],
151
+ posemb[0, self.start_index :],
152
+ )
153
+
154
+ gs_old = int(math.sqrt(len(posemb_grid)))
155
+
156
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
157
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
158
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
159
+
160
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
161
+
162
+ return posemb
163
+
164
+
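+ # Replacement forward pass injected into the timm VisionTransformer: it resizes the positional
+ # embeddings to the actual input size and handles both plain and hybrid (ResNet) patch embeddings,
+ # as well as distilled models that carry an extra dist_token.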
165
+ def forward_flex(self, x):
166
+ b, c, h, w = x.shape
167
+
168
+ pos_embed = self._resize_pos_embed(
169
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
170
+ )
171
+
172
+ B = x.shape[0]
173
+
174
+ if hasattr(self.patch_embed, "backbone"):
175
+ x = self.patch_embed.backbone(x)
176
+ if isinstance(x, (list, tuple)):
177
+ x = x[-1] # last feature if backbone outputs list/tuple of features
178
+
179
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
180
+
181
+ if getattr(self, "dist_token", None) is not None:
182
+ cls_tokens = self.cls_token.expand(
183
+ B, -1, -1
184
+ ) # stole cls_tokens impl from Phil Wang, thanks
185
+ dist_token = self.dist_token.expand(B, -1, -1)
186
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
187
+ else:
188
+ cls_tokens = self.cls_token.expand(
189
+ B, -1, -1
190
+ ) # stole cls_tokens impl from Phil Wang, thanks
191
+ x = torch.cat((cls_tokens, x), dim=1)
192
+
193
+ x = x + pos_embed
194
+ x = self.pos_drop(x)
195
+
196
+ for blk in self.blocks:
197
+ x = blk(x)
198
+
199
+ x = self.norm(x)
200
+
201
+ return x
202
+
203
+
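+ # Builds one readout operation per hooked layer: 'ignore' drops the readout token, 'add' adds it
+ # to every patch token, and 'project' concatenates it with each patch token and projects back to
+ # the embedding width with a Linear + GELU.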
204
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
205
+ if use_readout == "ignore":
206
+ readout_oper = [Slice(start_index)] * len(features)
207
+ elif use_readout == "add":
208
+ readout_oper = [AddReadout(start_index)] * len(features)
209
+ elif use_readout == "project":
210
+ readout_oper = [
211
+ ProjectReadout(vit_features, start_index) for out_feat in features
212
+ ]
213
+ else:
214
+ raise ValueError(
+ f"Invalid use_readout value {use_readout!r}; expected 'ignore', 'add' or 'project'."
+ )
217
+
218
+ return readout_oper
219
+
220
+
221
+ def _make_vit_b16_backbone(
222
+ model,
223
+ features=[96, 192, 384, 768],
224
+ size=[384, 384],
225
+ hooks=[2, 5, 8, 11],
226
+ vit_features=768,
227
+ use_readout="ignore",
228
+ start_index=1,
229
+ enable_attention_hooks=False,
230
+ ):
231
+ pretrained = nn.Module()
232
+
233
+ pretrained.model = model
234
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
235
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
236
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
237
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
238
+
239
+ pretrained.activations = activations
240
+
241
+ if enable_attention_hooks:
242
+ pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
243
+ get_attention("attn_1")
244
+ )
245
+ pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
246
+ get_attention("attn_2")
247
+ )
248
+ pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
249
+ get_attention("attn_3")
250
+ )
251
+ pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
252
+ get_attention("attn_4")
253
+ )
254
+ pretrained.attention = attention
255
+
256
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
257
+
258
+ # Post-processing heads: project each hooked activation to its target channel width and rescale to strides 4, 8, 16 and 32.
259
+ pretrained.act_postprocess1 = nn.Sequential(
260
+ readout_oper[0],
261
+ Transpose(1, 2),
262
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
263
+ nn.Conv2d(
264
+ in_channels=vit_features,
265
+ out_channels=features[0],
266
+ kernel_size=1,
267
+ stride=1,
268
+ padding=0,
269
+ ),
270
+ nn.ConvTranspose2d(
271
+ in_channels=features[0],
272
+ out_channels=features[0],
273
+ kernel_size=4,
274
+ stride=4,
275
+ padding=0,
276
+ bias=True,
277
+ dilation=1,
278
+ groups=1,
279
+ ),
280
+ )
281
+
282
+ pretrained.act_postprocess2 = nn.Sequential(
283
+ readout_oper[1],
284
+ Transpose(1, 2),
285
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
286
+ nn.Conv2d(
287
+ in_channels=vit_features,
288
+ out_channels=features[1],
289
+ kernel_size=1,
290
+ stride=1,
291
+ padding=0,
292
+ ),
293
+ nn.ConvTranspose2d(
294
+ in_channels=features[1],
295
+ out_channels=features[1],
296
+ kernel_size=2,
297
+ stride=2,
298
+ padding=0,
299
+ bias=True,
300
+ dilation=1,
301
+ groups=1,
302
+ ),
303
+ )
304
+
305
+ pretrained.act_postprocess3 = nn.Sequential(
306
+ readout_oper[2],
307
+ Transpose(1, 2),
308
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
309
+ nn.Conv2d(
310
+ in_channels=vit_features,
311
+ out_channels=features[2],
312
+ kernel_size=1,
313
+ stride=1,
314
+ padding=0,
315
+ ),
316
+ )
317
+
318
+ pretrained.act_postprocess4 = nn.Sequential(
319
+ readout_oper[3],
320
+ Transpose(1, 2),
321
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
322
+ nn.Conv2d(
323
+ in_channels=vit_features,
324
+ out_channels=features[3],
325
+ kernel_size=1,
326
+ stride=1,
327
+ padding=0,
328
+ ),
329
+ nn.Conv2d(
330
+ in_channels=features[3],
331
+ out_channels=features[3],
332
+ kernel_size=3,
333
+ stride=2,
334
+ padding=1,
335
+ ),
336
+ )
337
+
338
+ pretrained.model.start_index = start_index
339
+ pretrained.model.patch_size = [16, 16]
340
+
341
+ # We inject this function into the VisionTransformer instances so that
342
+ # we can use it with interpolated position embeddings without modifying the library source.
343
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
344
+ pretrained.model._resize_pos_embed = types.MethodType(
345
+ _resize_pos_embed, pretrained.model
346
+ )
347
+
348
+ return pretrained
349
+
350
+
351
+ def _make_vit_b_rn50_backbone(
352
+ model,
353
+ features=[256, 512, 768, 768],
354
+ size=[384, 384],
355
+ hooks=[0, 1, 8, 11],
356
+ vit_features=768,
357
+ use_vit_only=False,
358
+ use_readout="ignore",
359
+ start_index=1,
360
+ enable_attention_hooks=False,
361
+ ):
362
+ pretrained = nn.Module()
363
+
364
+ pretrained.model = model
365
+
366
+ if use_vit_only:
367
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
368
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
369
+ else:
370
+ pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
371
+ get_activation("1")
372
+ )
373
+ pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
374
+ get_activation("2")
375
+ )
376
+
377
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
378
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
379
+
380
+ if enable_attention_hooks:
381
+ pretrained.model.blocks[2].attn.register_forward_hook(get_attention("attn_1"))
382
+ pretrained.model.blocks[5].attn.register_forward_hook(get_attention("attn_2"))
383
+ pretrained.model.blocks[8].attn.register_forward_hook(get_attention("attn_3"))
384
+ pretrained.model.blocks[11].attn.register_forward_hook(get_attention("attn_4"))
385
+ pretrained.attention = attention
386
+
387
+ pretrained.activations = activations
388
+
389
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
390
+
391
+ if use_vit_only:
392
+ pretrained.act_postprocess1 = nn.Sequential(
393
+ readout_oper[0],
394
+ Transpose(1, 2),
395
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
396
+ nn.Conv2d(
397
+ in_channels=vit_features,
398
+ out_channels=features[0],
399
+ kernel_size=1,
400
+ stride=1,
401
+ padding=0,
402
+ ),
403
+ nn.ConvTranspose2d(
404
+ in_channels=features[0],
405
+ out_channels=features[0],
406
+ kernel_size=4,
407
+ stride=4,
408
+ padding=0,
409
+ bias=True,
410
+ dilation=1,
411
+ groups=1,
412
+ ),
413
+ )
414
+
415
+ pretrained.act_postprocess2 = nn.Sequential(
416
+ readout_oper[1],
417
+ Transpose(1, 2),
418
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
419
+ nn.Conv2d(
420
+ in_channels=vit_features,
421
+ out_channels=features[1],
422
+ kernel_size=1,
423
+ stride=1,
424
+ padding=0,
425
+ ),
426
+ nn.ConvTranspose2d(
427
+ in_channels=features[1],
428
+ out_channels=features[1],
429
+ kernel_size=2,
430
+ stride=2,
431
+ padding=0,
432
+ bias=True,
433
+ dilation=1,
434
+ groups=1,
435
+ ),
436
+ )
437
+ else:
438
+ pretrained.act_postprocess1 = nn.Sequential(
439
+ nn.Identity(), nn.Identity(), nn.Identity()
440
+ )
441
+ pretrained.act_postprocess2 = nn.Sequential(
442
+ nn.Identity(), nn.Identity(), nn.Identity()
443
+ )
444
+
445
+ pretrained.act_postprocess3 = nn.Sequential(
446
+ readout_oper[2],
447
+ Transpose(1, 2),
448
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
449
+ nn.Conv2d(
450
+ in_channels=vit_features,
451
+ out_channels=features[2],
452
+ kernel_size=1,
453
+ stride=1,
454
+ padding=0,
455
+ ),
456
+ )
457
+
458
+ pretrained.act_postprocess4 = nn.Sequential(
459
+ readout_oper[3],
460
+ Transpose(1, 2),
461
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
462
+ nn.Conv2d(
463
+ in_channels=vit_features,
464
+ out_channels=features[3],
465
+ kernel_size=1,
466
+ stride=1,
467
+ padding=0,
468
+ ),
469
+ nn.Conv2d(
470
+ in_channels=features[3],
471
+ out_channels=features[3],
472
+ kernel_size=3,
473
+ stride=2,
474
+ padding=1,
475
+ ),
476
+ )
477
+
478
+ pretrained.model.start_index = start_index
479
+ pretrained.model.patch_size = [16, 16]
480
+
481
+ # We inject this function into the VisionTransformer instances so that
482
+ # we can use it with interpolated position embeddings without modifying the library source.
483
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
484
+
485
+ # We inject this function into the VisionTransformer instances so that
486
+ # we can use it with interpolated position embeddings without modifying the library source.
487
+ pretrained.model._resize_pos_embed = types.MethodType(
488
+ _resize_pos_embed, pretrained.model
489
+ )
490
+
491
+ return pretrained
492
+
493
+
494
+ def _make_pretrained_vitb_rn50_384(
495
+ pretrained,
496
+ use_readout="ignore",
497
+ hooks=None,
498
+ use_vit_only=False,
499
+ enable_attention_hooks=False,
500
+ ):
501
+ model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
502
+
503
+ hooks = [0, 1, 8, 11] if hooks is None else hooks
504
+ return _make_vit_b_rn50_backbone(
505
+ model,
506
+ features=[256, 512, 768, 768],
507
+ size=[384, 384],
508
+ hooks=hooks,
509
+ use_vit_only=use_vit_only,
510
+ use_readout=use_readout,
511
+ enable_attention_hooks=enable_attention_hooks,
512
+ )
513
+
514
+
515
+ def _make_pretrained_vitl16_384(
516
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
517
+ ):
518
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
519
+
520
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
521
+ return _make_vit_b16_backbone(
522
+ model,
523
+ features=[256, 512, 1024, 1024],
524
+ hooks=hooks,
525
+ vit_features=1024,
526
+ use_readout=use_readout,
527
+ enable_attention_hooks=enable_attention_hooks,
528
+ )
529
+
530
+
531
+ def _make_pretrained_vitb16_384(
532
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
533
+ ):
534
+ model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
535
+
536
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
537
+ return _make_vit_b16_backbone(
538
+ model,
539
+ features=[96, 192, 384, 768],
540
+ hooks=hooks,
541
+ use_readout=use_readout,
542
+ enable_attention_hooks=enable_attention_hooks,
543
+ )
544
+
545
+
546
+ def _make_pretrained_deitb16_384(
547
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
548
+ ):
549
+ model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
550
+
551
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
552
+ return _make_vit_b16_backbone(
553
+ model,
554
+ features=[96, 192, 384, 768],
555
+ hooks=hooks,
556
+ use_readout=use_readout,
557
+ enable_attention_hooks=enable_attention_hooks,
558
+ )
559
+
560
+
561
+ def _make_pretrained_deitb16_distil_384(
562
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
563
+ ):
564
+ model = timm.create_model(
565
+ "vit_deit_base_distilled_patch16_384", pretrained=pretrained
566
+ )
567
+
568
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
569
+ return _make_vit_b16_backbone(
570
+ model,
571
+ features=[96, 192, 384, 768],
572
+ hooks=hooks,
573
+ use_readout=use_readout,
574
+ start_index=2,
575
+ enable_attention_hooks=enable_attention_hooks,
576
+ )
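For orientation, a hedged sketch of how these factory functions are consumed; pretrained=False is used only to keep the example light, and the DPT models defined in models.py are what actually wire the resulting backbone into a decoder.

import torch

# Build a ViT-Base/16 backbone with forward hooks and post-processing heads attached.
backbone = _make_pretrained_vitb16_384(pretrained=False, use_readout="project")

x = torch.randn(1, 3, 384, 384)
# forward_vit returns four feature maps at (approximately) strides 4, 8, 16 and 32,
# which a DPT-style decoder then fuses into a dense prediction.
layer_1, layer_2, layer_3, layer_4 = forward_vit(backbone, x)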
streamlit_apps/app_utils/image_inference.py ADDED
@@ -0,0 +1,88 @@
1
+ import time
2
+ import numpy as np
3
+ import streamlit as st
4
+ from PIL import Image
5
+
6
+ from s_multimae.da.base_da import BaseDataAugmentation
7
+ from .base_model import BaseRGBDModel
8
+ from .depth_model import BaseDepthModel
9
+ from .model import base_inference
10
+
11
+ if "depth" not in st.session_state:
12
+ st.session_state.depth = None
13
+
14
+
15
+ def image_inference(
16
+ depth_model: BaseDepthModel,
17
+ sod_model: BaseRGBDModel,
18
+ da: BaseDataAugmentation,
19
+ color: np.ndarray,
20
+ ) -> None:
21
+ col1, col2 = st.columns(2)
22
+ image: Image.Image = None
23
+ # depth: Image = None
24
+
25
+ with col1:
26
+ img_file_buffer = st.file_uploader(
27
+ "Upload an RGB image", key="img_file_buffer", type=["png", "jpg", "jpeg"]
28
+ )
29
+ if img_file_buffer is not None:
30
+ image = Image.open(img_file_buffer).convert("RGB")
31
+ st.image(image, caption="RGB")
32
+
33
+ with col2:
34
+ depth_file_buffer = st.file_uploader(
35
+ "Upload a depth image (Optional)",
36
+ key="depth_file_buffer",
37
+ type=["png", "jpg", "jpeg"],
38
+ )
39
+ if depth_file_buffer is not None:
40
+ st.session_state.depth = Image.open(depth_file_buffer).convert("L")
41
+ if st.session_state.depth is not None:
42
+ st.image(st.session_state.depth, caption="Depth")
43
+
44
+ if sod_model.cfg.ground_truth_version == 6:
45
+ num_sets_of_salient_objects = st.number_input(
46
+ "Number of sets of salient objects", value=1, min_value=1, max_value=10
47
+ )
48
+ else:
49
+ num_sets_of_salient_objects = 1
50
+
51
+ is_predict = st.button(
52
+ "Predict Salient Objects",
53
+ key="predict_salient_objects",
54
+ disabled=img_file_buffer is None,
55
+ )
56
+ if is_predict:
57
+ with st.spinner("Processing..."):
58
+ start_time = time.time()
59
+ pred_depth, pred_sods, pred_sms = base_inference(
60
+ depth_model,
61
+ sod_model,
62
+ da,
63
+ image,
64
+ st.session_state.depth,
65
+ color,
66
+ num_sets_of_salient_objects,
67
+ )
68
+ if st.session_state.depth is None:
69
+ st.session_state.depth = Image.fromarray(pred_depth).convert("L")
70
+ col2.image(st.session_state.depth, "Pseudo-depth")
71
+
72
+ if num_sets_of_salient_objects == 1:
73
+ st.warning(
74
+ "HINT: To view a wider variety of sets of salient objects, try to increase the number of sets the model can produce."
75
+ )
76
+ elif num_sets_of_salient_objects > 1:
77
+ st.warning(
78
+ "NOTE: As single-GT accounts for 77.61% of training samples, the model may not consistently yield different sets. The best approach is to gradually increase the number of sets of salient objects until you achieve the desired result."
79
+ )
80
+
81
+ st.info(f"Inference time: {time.time() - start_time:.4f} seconds")
82
+
83
+ sod_cols = st.columns(len(pred_sods))
84
+
85
+ for i, (pred_sod, pred_sm) in enumerate(zip(pred_sods, pred_sms)):
86
+ with sod_cols[i]:
87
+ st.image(pred_sod, "Salient Objects (Otsu threshold)")
88
+ st.image(pred_sm, "Salient Map")
streamlit_apps/app_utils/model.py ADDED
@@ -0,0 +1,84 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torchvision.transforms.functional as TF
6
+ from PIL import Image
7
+ from torch import Tensor, nn
9
+ from skimage.filters import threshold_otsu
10
+
11
+ from s_multimae.da.base_da import BaseDataAugmentation
12
+ from s_multimae.model_pl import ModelPL
13
+ from s_multimae.visualizer import apply_vis_to_image
14
+
15
+ from .base_model import BaseRGBDModel
16
+ from .app_utils import get_size, normalize
17
+ from .depth_model import BaseDepthModel
18
+
19
+
20
+ # Environment
21
+ torch.set_grad_enabled(False)
22
+ from .device import device
23
+
24
+ print(f"device: {device}")
25
+
26
+
27
+ def post_processing_depth(depth: np.ndarray) -> np.ndarray:
28
+ depth = (normalize(depth) * 255).astype(np.uint8)
29
+ return cv2.applyColorMap(depth, cv2.COLORMAP_OCEAN)
30
+
31
+
32
+ def base_inference(
33
+ depth_model: BaseDepthModel,
34
+ sod_model: BaseRGBDModel,
35
+ da: BaseDataAugmentation,
36
+ raw_image: Union[Image.Image, np.ndarray],
37
+ raw_depth: Optional[Union[Image.Image, np.ndarray]] = None,
38
+ color: Optional[np.ndarray] = None,
39
+ num_sets_of_salient_objects: int = 1,
40
+ ) -> Tuple[np.ndarray, List[np.ndarray], List[np.ndarray]]:
41
+ """Inference a pair of rgb image and depth image
42
+ if depth image is not provided, the depth_model will predict a depth image based on image
43
+ """
44
+ origin_size = get_size(raw_image)
45
+
46
+ # Predict depth
47
+ image = TF.to_tensor(raw_image)
48
+ origin_shape = image.shape
49
+ if raw_depth is None:
50
+ depth: Tensor = depth_model.forward(image)
51
+ else:
52
+ depth = TF.to_tensor(raw_depth)
53
+
54
+ # Preprocessing
55
+ image, depth = da.forward(
56
+ raw_image, depth.cpu().detach().squeeze(0).numpy(), is_transform=False
57
+ )
58
+
59
+ # Inference
60
+ sms = sod_model.inference(image, depth, origin_shape, num_sets_of_salient_objects)
61
+
62
+ # Postprocessing
63
+ sods = []
64
+
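+ # Binarize each saliency map with Otsu's threshold and overlay the resulting mask on the
+ # original RGB image for visualization.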
65
+ for sm in sms:
66
+ binary_mask = np.array(sm)
67
+ t = threshold_otsu(binary_mask)
68
+ binary_mask[binary_mask < t] = 0.0
69
+ binary_mask[binary_mask >= t] = 1.0
70
+
71
+ sod = apply_vis_to_image(np.array(raw_image), binary_mask, color)
72
+ sods.append(sod)
73
+
74
+ depth = depth.permute(1, 2, 0).detach().cpu().numpy()
75
+ depth = cv2.resize(depth, origin_size)
76
+ depth = post_processing_depth(depth)
77
+
78
+ return depth, sods, [e / 255.0 for e in sms]
79
+
80
+
81
+ def transform_images(inputs: List[Image.Image], transform: nn.Module) -> Tensor:
82
+ if len(inputs) == 1:
83
+ return transform(inputs[0]).unsqueeze(0)
84
+ return torch.cat([transform(input).unsqueeze(0) for input in inputs])
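A hedged end-to-end sketch of base_inference. It assumes depth_model, sod_model and da have already been constructed (for example via the loaders in sod_selection_ui.py and depth_model.py); the input path and overlay color are arbitrary examples.

import numpy as np
from PIL import Image

rgb = Image.open("example_rgb.jpg").convert("RGB")      # hypothetical input image
overlay_color = np.array([255, 0, 0], dtype=np.uint8)   # assumed RGB color for the salient-object overlay

pred_depth, pred_sods, pred_sms = base_inference(
    depth_model,                    # assumed: a BaseDepthModel instance
    sod_model,                      # assumed: a BaseRGBDModel instance (e.g. RGBDSMultiMAEModel)
    da,                             # assumed: the matching BaseDataAugmentation
    rgb,
    raw_depth=None,                 # no depth given, so a pseudo-depth is predicted first
    color=overlay_color,
    num_sets_of_salient_objects=2,
)
# pred_depth: colorized (pseudo-)depth map, pred_sods: RGB overlays of the Otsu-binarized masks,
# pred_sms: saliency maps rescaled to [0, 1].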
streamlit_apps/app_utils/smultimae_model.py ADDED
@@ -0,0 +1,43 @@
1
+ import numpy as np
2
+ from torch import Tensor
3
+ from torchvision.transforms import Resize
4
+
5
+ from s_multimae.model_pl import ModelPL
6
+ from s_multimae.configs.base_config import base_cfg
7
+
8
+ from .base_model import BaseRGBDModel
9
+
10
+
11
+ class RGBDSMultiMAEModel(BaseRGBDModel):
12
+ def __init__(self, cfg: base_cfg, model: ModelPL):
13
+ """Wrapper of RGBDModel"""
14
+ super(RGBDSMultiMAEModel, self).__init__()
15
+ self.model: ModelPL = model
16
+ self.cfg = cfg
17
+ self.resize = Resize([self.cfg.image_size, self.cfg.image_size])
18
+
19
+ def inference(
20
+ self,
21
+ image: Tensor,
22
+ depth: Tensor,
23
+ origin_shape: np.ndarray,
24
+ num_sets_of_salient_objects: int = 1,
25
+ ) -> np.ndarray:
26
+ # 1. Preprocessing
27
+ images = image.unsqueeze(0)
28
+ depths = depth.unsqueeze(0)
29
+
30
+ # images = self.resize(images)
31
+ # depths = self.resize(depths)
32
+
33
+ # 2. Inference
34
+ images, depths = images.to(self.model.device), depths.to(self.model.device)
35
+ if self.cfg.ground_truth_version == 6:
36
+ self.cfg.num_classes = num_sets_of_salient_objects
37
+ res = self.model.inference(
38
+ [[origin_shape[2], origin_shape[1]]],
39
+ images,
40
+ depths,
41
+ [num_sets_of_salient_objects],
42
+ )
43
+ return res[0]
streamlit_apps/app_utils/sod_selection_ui.py ADDED
@@ -0,0 +1,111 @@
1
+ from typing import Tuple
2
+ import streamlit as st
3
+ import os
4
+ import torch
5
+
6
+ from .app_env import SOD_MODEL_TYPE
7
+ from .app_utils import count_parameters
8
+ from .smultimae_model import RGBDSMultiMAEModel
9
+ from .base_model import BaseRGBDModel
10
+ from .device import device
11
+
12
+ from s_multimae.da.dav6 import DataAugmentationV6
13
+ from s_multimae.configs.base_config import base_cfg
14
+ from s_multimae.configs.experiment_config import arg_cfg
15
+ from s_multimae.model_pl import ModelPL
16
+
17
+ # from spnet_model import SPNetModel
18
+
19
+
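+ # st.cache_resource caches the constructed model across Streamlit reruns, keyed by
+ # (sod_model_config_key, top), so the checkpoint is downloaded and loaded only once.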
20
+ @st.cache_resource
21
+ def load_smultimae_model(
22
+ sod_model_config_key: str, top: int
23
+ ) -> Tuple[BaseRGBDModel, base_cfg]:
24
+ """
25
+ 1. Construct model
26
+ 2. Load pretrained weights
27
+ 3. Load model into device
28
+ """
29
+ cfg = arg_cfg[sod_model_config_key]()
30
+
31
+ weights_fname = f"s-multimae-{cfg.experiment_name}-top{top}.pth"
32
+ ckpt_path = os.path.join(
33
+ "weights", weights_fname
34
+ )
35
+ print(ckpt_path)
36
+ if not os.path.isfile(ckpt_path):
37
+ from huggingface_hub import hf_hub_download
38
+ downloaded_path = hf_hub_download(repo_id="RGBD-SOD/S-MultiMAE", filename=weights_fname)
+ os.system(f"cp {downloaded_path} {ckpt_path}")  # hf_hub_download returns the cached file path
40
+ assert os.path.isfile(ckpt_path)
41
+
42
+ # sod_model = ModelPL.load_from_checkpoint(
43
+ # ckpt_path,
44
+ # cfg=cfg,
45
+ # map_location=device,
46
+ # )
47
+ sod_model = ModelPL(cfg)
48
+ sod_model.model.load_state_dict(
49
+ torch.load(ckpt_path, map_location="cpu"), strict=False
50
+ )
51
+ da = DataAugmentationV6(cfg)
52
+ return RGBDSMultiMAEModel(cfg, sod_model), cfg, da
53
+
54
+
55
+ # @st.cache_resource
56
+ # def load_spnet_model() -> BaseRGBDModel:
57
+ # """
58
+ # 1. Construct model
59
+ # 2. Load pretrained weights
60
+ # 3. Load model into device
61
+ # """
62
+ # sod_model = SPNetModel()
63
+ # return sod_model
64
+
65
+
66
+ # @st.cache_resource
67
+ # def load_bbsnet_model() -> BaseRGBDModel:
68
+ # """
69
+ # 1. Construct model
70
+ # 2. Load pretrained weights
71
+ # 3. Load model into device
72
+ # """
73
+ # sod_model = BBSNetModel()
74
+ # return sod_model
75
+
76
+
77
+ def sod_selection_ui() -> BaseRGBDModel:
78
+ sod_model_type = st.selectbox(
79
+ "Choose SOD model",
80
+ (
81
+ SOD_MODEL_TYPE.S_MULTIMAE,
82
+ # SOD_MODEL_TYPE.SPNET,
83
+ # SOD_MODEL_TYPE.BBSNET,
84
+ ),
85
+ key="sod_model_type",
86
+ )
87
+
88
+ if sod_model_type == SOD_MODEL_TYPE.S_MULTIMAE:
89
+ d = {
90
+ "S-MultiMAE [ViT-L] Multi-GT": {"top": 1, "cfg": "cfgv4_0_2006"},
91
+ "S-MultiMAE [ViT-B] Multi-GT": {"top": 1, "cfg": "cfgv4_0_2007"},
92
+ }
93
+
94
+ sod_model_config_key = st.selectbox(
95
+ "Choose config",
96
+ list(d.keys()),
97
+ key="sod_model_config_key",
98
+ )
99
+ sod_model, cfg, da = load_smultimae_model(
100
+ d[sod_model_config_key]["cfg"], d[sod_model_config_key]["top"]
101
+ )
102
+ # st.text(f"Model description: {cfg.description}")
103
+ # elif sod_model_type == SOD_MODEL_TYPE.SPNET:
104
+ # sod_model = load_spnet_model()
105
+ # st.text(f"Model description: SPNet (https://github.com/taozh2017/SPNet)")
106
+ # elif sod_model_type == SOD_MODEL_TYPE.BBSNET:
107
+ # sod_model = load_bbsnet_model()
108
+ # st.text(f"Model description: BBSNet (https://github.com/DengPingFan/BBS-Net)")
109
+ st.text(f"Number of parameters {count_parameters(sod_model)}")
110
+
111
+ return sod_model, da