diff --git a/ControlNet/.gitignore b/ControlNet/.gitignore deleted file mode 100644 index 325e5fd26480f88769892b2c263e91daca4ba719..0000000000000000000000000000000000000000 --- a/ControlNet/.gitignore +++ /dev/null @@ -1,143 +0,0 @@ -.idea/ - -training/ -lightning_logs/ -image_log/ - -*.pth -*.pt -*.ckpt -*.safetensors - -gradio_pose2image_private.py -gradio_canny2image_private.py - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ diff --git a/ControlNet/LICENSE b/ControlNet/LICENSE deleted file mode 100644 index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..0000000000000000000000000000000000000000 --- a/ControlNet/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/ControlNet/README.md b/ControlNet/README.md deleted file mode 100644 index 7e240b1821386750d92e5180b130a802d8e956ce..0000000000000000000000000000000000000000 --- a/ControlNet/README.md +++ /dev/null @@ -1,348 +0,0 @@ -# News: A nightly version of ControlNet 1.1 is released! - -[ControlNet 1.1](https://github.com/lllyasviel/ControlNet-v1-1-nightly) is released. 
Those new models will be merged to this repo after we make sure that everything is good. - -# Below is ControlNet 1.0 - -Official implementation of [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543). - -ControlNet is a neural network structure to control diffusion models by adding extra conditions. - -![img](github_page/he.png) - -It copys the weights of neural network blocks into a "locked" copy and a "trainable" copy. - -The "trainable" one learns your condition. The "locked" one preserves your model. - -Thanks to this, training with small dataset of image pairs will not destroy the production-ready diffusion models. - -The "zero convolution" is 1×1 convolution with both weight and bias initialized as zeros. - -Before training, all zero convolutions output zeros, and ControlNet will not cause any distortion. - -No layer is trained from scratch. You are still fine-tuning. Your original model is safe. - -This allows training on small-scale or even personal devices. - -This is also friendly to merge/replacement/offsetting of models/weights/blocks/layers. - -### FAQ - -**Q:** But wait, if the weight of a conv layer is zero, the gradient will also be zero, and the network will not learn anything. Why "zero convolution" works? - -**A:** This is not true. [See an explanation here](docs/faq.md). - -# Stable Diffusion + ControlNet - -By repeating the above simple structure 14 times, we can control stable diffusion in this way: - -![img](github_page/sd.png) - -In this way, the ControlNet can **reuse** the SD encoder as a **deep, strong, robust, and powerful backbone** to learn diverse controls. Many evidences (like [this](https://jerryxu.net/ODISE/) and [this](https://vpd.ivg-research.xyz/)) validate that the SD encoder is an excellent backbone. - -Note that the way we connect layers is computational efficient. The original SD encoder does not need to store gradients (the locked original SD Encoder Block 1234 and Middle). The required GPU memory is not much larger than original SD, although many layers are added. Great! - -# Features & News - -2023/0/14 - We released [ControlNet 1.1](https://github.com/lllyasviel/ControlNet-v1-1-nightly). Those new models will be merged to this repo after we make sure that everything is good. - -2023/03/03 - We released a discussion - [Precomputed ControlNet: Speed up ControlNet by 45%, but is it necessary?](https://github.com/lllyasviel/ControlNet/discussions/216) - -2023/02/26 - We released a blog - [Ablation Study: Why ControlNets use deep encoder? What if it was lighter? Or even an MLP?](https://github.com/lllyasviel/ControlNet/discussions/188) - -2023/02/20 - Implementation for non-prompt mode released. See also [Guess Mode / Non-Prompt Mode](#guess-anchor). - -2023/02/12 - Now you can play with any community model by [Transferring the ControlNet](https://github.com/lllyasviel/ControlNet/discussions/12). - -2023/02/11 - [Low VRAM mode](docs/low_vram.md) is added. Please use this mode if you are using 8GB GPU(s) or if you want larger batch size. - -# Production-Ready Pretrained Models - -First create a new conda environment - - conda env create -f environment.yaml - conda activate control - -All models and detectors can be downloaded from [our Hugging Face page](https://huggingface.co/lllyasviel/ControlNet). Make sure that SD models are put in "ControlNet/models" and detectors are put in "ControlNet/annotator/ckpts". 
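If you prefer to fetch files programmatically, a minimal sketch using `huggingface_hub` is shown below; the exact file names are assumptions based on the Hugging Face page listing, so verify them against the repo before relying on them.

    from huggingface_hub import hf_hub_download

    # Each call downloads into the local Hugging Face cache and returns the path;
    # copy or symlink the files into "ControlNet/models" and
    # "ControlNet/annotator/ckpts" as described above.
    sd_canny_ckpt = hf_hub_download("lllyasviel/ControlNet", "models/control_sd15_canny.pth")
    hed_weights = hf_hub_download("lllyasviel/ControlNet", "annotator/ckpts/network-bsds500.pth")
    print(sd_canny_ckpt, hed_weights)
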
Make sure that you download all necessary pretrained weights and detector models from that Hugging Face page, including HED edge detection model, Midas depth estimation model, Openpose, and so on. - -We provide 9 Gradio apps with these models. - -All test images can be found at the folder "test_imgs". - -## ControlNet with Canny Edge - -Stable Diffusion 1.5 + ControlNet (using simple Canny edge detection) - - python gradio_canny2image.py - -The Gradio app also allows you to change the Canny edge thresholds. Just try it for more details. - -Prompt: "bird" -![p](github_page/p1.png) - -Prompt: "cute dog" -![p](github_page/p2.png) - -## ControlNet with M-LSD Lines - -Stable Diffusion 1.5 + ControlNet (using simple M-LSD straight line detection) - - python gradio_hough2image.py - -The Gradio app also allows you to change the M-LSD thresholds. Just try it for more details. - -Prompt: "room" -![p](github_page/p3.png) - -Prompt: "building" -![p](github_page/p4.png) - -## ControlNet with HED Boundary - -Stable Diffusion 1.5 + ControlNet (using soft HED Boundary) - - python gradio_hed2image.py - -The soft HED Boundary will preserve many details in input images, making this app suitable for recoloring and stylizing. Just try it for more details. - -Prompt: "oil painting of handsome old man, masterpiece" -![p](github_page/p5.png) - -Prompt: "Cyberpunk robot" -![p](github_page/p6.png) - -## ControlNet with User Scribbles - -Stable Diffusion 1.5 + ControlNet (using Scribbles) - - python gradio_scribble2image.py - -Note that the UI is based on Gradio, and Gradio is somewhat difficult to customize. Right now you need to draw scribbles outside the UI (using your favorite drawing software, for example, MS Paint) and then import the scribble image to Gradio. - -Prompt: "turtle" -![p](github_page/p7.png) - -Prompt: "hot air balloon" -![p](github_page/p8.png) - -### Interactive Interface - -We actually provide an interactive interface - - python gradio_scribble2image_interactive.py - -~~However, because gradio is very [buggy](https://github.com/gradio-app/gradio/issues/3166) and difficult to customize, right now, user need to first set canvas width and heights and then click "Open drawing canvas" to get a drawing area. Please do not upload image to that drawing canvas. Also, the drawing area is very small; it should be bigger. But I failed to find out how to make it larger. Again, gradio is really buggy.~~ (Now fixed, will update asap) - -The below dog sketch is drawn by me. Perhaps we should draw a better dog for showcase. - -Prompt: "dog in a room" -![p](github_page/p20.png) - -## ControlNet with Fake Scribbles - -Stable Diffusion 1.5 + ControlNet (using fake scribbles) - - python gradio_fake_scribble2image.py - -Sometimes we are lazy, and we do not want to draw scribbles. This script use the exactly same scribble-based model but use a simple algorithm to synthesize scribbles from input images. - -Prompt: "bag" -![p](github_page/p9.png) - -Prompt: "shose" (Note that "shose" is a typo; it should be "shoes". But it still seems to work.) -![p](github_page/p10.png) - -## ControlNet with Human Pose - -Stable Diffusion 1.5 + ControlNet (using human pose) - - python gradio_pose2image.py - -Apparently, this model deserves a better UI to directly manipulate pose skeleton. However, again, Gradio is somewhat difficult to customize. Right now you need to input an image and then the Openpose will detect the pose for you. 
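For reference, the pose app prepares its control map roughly as in the sketch below before sampling. `OpenposeDetector`, `resize_image`, and `HWC3` are the annotator utilities shipped in this repo; the exact call signature is an assumption, so check `annotator/openpose` in your checkout.

    import numpy as np
    from annotator.util import resize_image, HWC3
    from annotator.openpose import OpenposeDetector

    apply_openpose = OpenposeDetector()

    def pose_control_map(input_image: np.ndarray, detect_resolution: int = 512) -> np.ndarray:
        # Run Openpose on a resized copy of the input; the detector returns the
        # rendered skeleton map plus keypoint data, which is not needed here.
        detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution))
        return HWC3(detected_map)
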
- -Prompt: "Chief in the kitchen" -![p](github_page/p11.png) - -Prompt: "An astronaut on the moon" -![p](github_page/p12.png) - -## ControlNet with Semantic Segmentation - -Stable Diffusion 1.5 + ControlNet (using semantic segmentation) - - python gradio_seg2image.py - -This model use ADE20K's segmentation protocol. Again, this model deserves a better UI to directly draw the segmentations. However, again, Gradio is somewhat difficult to customize. Right now you need to input an image and then a model called Uniformer will detect the segmentations for you. Just try it for more details. - -Prompt: "House" -![p](github_page/p13.png) - -Prompt: "River" -![p](github_page/p14.png) - -## ControlNet with Depth - -Stable Diffusion 1.5 + ControlNet (using depth map) - - python gradio_depth2image.py - -Great! Now SD 1.5 also have a depth control. FINALLY. So many possibilities (considering SD1.5 has much more community models than SD2). - -Note that different from Stability's model, the ControlNet receive the full 512×512 depth map, rather than 64×64 depth. Note that Stability's SD2 depth model use 64*64 depth maps. This means that the ControlNet will preserve more details in the depth map. - -This is always a strength because if users do not want to preserve more details, they can simply use another SD to post-process an i2i. But if they want to preserve more details, ControlNet becomes their only choice. Again, SD2 uses 64×64 depth, we use 512×512. - -Prompt: "Stormtrooper's lecture" -![p](github_page/p15.png) - -## ControlNet with Normal Map - -Stable Diffusion 1.5 + ControlNet (using normal map) - - python gradio_normal2image.py - -This model use normal map. Rightnow in the APP, the normal is computed from the midas depth map and a user threshold (to determine how many area is background with identity normal face to viewer, tune the "Normal background threshold" in the gradio app to get a feeling). - -Prompt: "Cute toy" -![p](github_page/p17.png) - -Prompt: "Plaster statue of Abraham Lincoln" -![p](github_page/p18.png) - -Compared to depth model, this model seems to be a bit better at preserving the geometry. This is intuitive: minor details are not salient in depth maps, but are salient in normal maps. Below is the depth result with same inputs. You can see that the hairstyle of the man in the input image is modified by depth model, but preserved by the normal model. - -Prompt: "Plaster statue of Abraham Lincoln" -![p](github_page/p19.png) - -## ControlNet with Anime Line Drawing - -We also trained a relatively simple ControlNet for anime line drawings. This tool may be useful for artistic creations. (Although the image details in the results is a bit modified, since it still diffuse latent images.) - -This model is not available right now. We need to evaluate the potential risks before releasing this model. Nevertheless, you may be interested in [transferring the ControlNet to any community model](https://github.com/lllyasviel/ControlNet/discussions/12). - -![p](github_page/p21.png) - - - -# Guess Mode / Non-Prompt Mode - -The "guess mode" (or called non-prompt mode) will completely unleash all the power of the very powerful ControlNet encoder. - -See also the blog - [Ablation Study: Why ControlNets use deep encoder? What if it was lighter? Or even an MLP?](https://github.com/lllyasviel/ControlNet/discussions/188) - -You need to manually check the "Guess Mode" toggle to enable this mode. 
- -In this mode, the ControlNet encoder will try best to recognize the content of the input control map, like depth map, edge map, scribbles, etc, even if you remove all prompts. - -**Let's have fun with some very challenging experimental settings!** - -**No prompts. No "positive" prompts. No "negative" prompts. No extra caption detector. One single diffusion loop.** - -For this mode, we recommend to use 50 steps and guidance scale between 3 and 5. - -![p](github_page/uc2a.png) - -No prompts: - -![p](github_page/uc2b.png) - -Note that the below example is 768×768. No prompts. No "positive" prompts. No "negative" prompts. - -![p](github_page/uc1.png) - -By tuning the parameters, you can get some very intereting results like below: - -![p](github_page/uc3.png) - -Because no prompt is available, the ControlNet encoder will "guess" what is in the control map. Sometimes the guess result is really interesting. Because diffusion algorithm can essentially give multiple results, the ControlNet seems able to give multiple guesses, like this: - -![p](github_page/uc4.png) - -Without prompt, the HED seems good at generating images look like paintings when the control strength is relatively low: - -![p](github_page/uc6.png) - -The Guess Mode is also supported in [WebUI Plugin](https://github.com/Mikubill/sd-webui-controlnet): - -![p](github_page/uci1.png) - -No prompts. Default WebUI parameters. Pure random results with the seed being 12345. Standard SD1.5. Input scribble is in "test_imgs" folder to reproduce. - -![p](github_page/uci2.png) - -Below is another challenging example: - -![p](github_page/uci3.png) - -No prompts. Default WebUI parameters. Pure random results with the seed being 12345. Standard SD1.5. Input scribble is in "test_imgs" folder to reproduce. - -![p](github_page/uci4.png) - -Note that in the guess mode, you will still be able to input prompts. The only difference is that the model will "try harder" to guess what is in the control map even if you do not provide the prompt. Just try it yourself! - -Besides, if you write some scripts (like BLIP) to generate image captions from the "guess mode" images, and then use the generated captions as prompts to diffuse again, you will get a SOTA pipeline for fully automatic conditional image generating. - -# Combining Multiple ControlNets - -ControlNets are composable: more than one ControlNet can be easily composed to multi-condition control. - -Right now this feature is in experimental stage in the [Mikubill' A1111 Webui Plugin](https://github.com/Mikubill/sd-webui-controlnet): - -![p](github_page/multi2.png) - -![p](github_page/multi.png) - -As long as the models are controlling the same SD, the "boundary" between different research projects does not even exist. This plugin also allows different methods to work together! - -# Use ControlNet in Any Community Model (SD1.X) - -This is an experimental feature. - -[See the steps here](https://github.com/lllyasviel/ControlNet/discussions/12). - -Or you may want to use the [Mikubill' A1111 Webui Plugin](https://github.com/Mikubill/sd-webui-controlnet) which is plug-and-play and does not need manual merging. - -# Annotate Your Own Data - -We provide simple python scripts to process images. - -[See a gradio example here](docs/annotator.md). - -# Train with Your Own Data - -Training a ControlNet is as easy as (or even easier than) training a simple pix2pix. - -[See the steps here](docs/train.md). 
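For orientation, the training entry point described in docs/train.md boils down to roughly the sketch below; the module and file names follow the tutorial script in this repo and should be treated as assumptions if your checkout differs.

    import pytorch_lightning as pl
    from torch.utils.data import DataLoader

    from tutorial_dataset import MyDataset          # yields dicts: jpg (target), txt (prompt), hint (control map)
    from cldm.logger import ImageLogger
    from cldm.model import create_model, load_state_dict

    # Build the ControlNet-wrapped SD model and start from the prepared "ini" checkpoint.
    model = create_model('./models/cldm_v15.yaml').cpu()
    model.load_state_dict(load_state_dict('./models/control_sd15_ini.ckpt', location='cpu'))
    model.learning_rate = 1e-5
    model.sd_locked = True           # keep the original SD weights frozen
    model.only_mid_control = False

    dataloader = DataLoader(MyDataset(), num_workers=0, batch_size=4, shuffle=True)
    trainer = pl.Trainer(gpus=1, precision=32, callbacks=[ImageLogger(batch_frequency=300)])
    trainer.fit(model, dataloader)
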
- -# Related Resources - -Special Thank to the great project - [Mikubill' A1111 Webui Plugin](https://github.com/Mikubill/sd-webui-controlnet) ! - -We also thank Hysts for making [Hugging Face Space](https://huggingface.co/spaces/hysts/ControlNet) as well as more than 65 models in that amazing [Colab list](https://github.com/camenduru/controlnet-colab)! - -Thank haofanwang for making [ControlNet-for-Diffusers](https://github.com/haofanwang/ControlNet-for-Diffusers)! - -We also thank all authors for making Controlnet DEMOs, including but not limited to [fffiloni](https://huggingface.co/spaces/fffiloni/ControlNet-Video), [other-model](https://huggingface.co/spaces/hysts/ControlNet-with-other-models), [ThereforeGames](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/7784), [RamAnanth1](https://huggingface.co/spaces/RamAnanth1/ControlNet), etc! - -Besides, you may also want to read these amazing related works: - -[Composer: Creative and Controllable Image Synthesis with Composable Conditions](https://github.com/damo-vilab/composer): A much bigger model to control diffusion! - -[T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://github.com/TencentARC/T2I-Adapter): A much smaller model to control stable diffusion! - -[ControlLoRA: A Light Neural Network To Control Stable Diffusion Spatial Information](https://github.com/HighCWu/ControlLoRA): Implement Controlnet using LORA! - -And these amazing recent projects: [InstructPix2Pix Learning to Follow Image Editing Instructions](https://www.timothybrooks.com/instruct-pix2pix), [Pix2pix-zero: Zero-shot Image-to-Image Translation](https://github.com/pix2pixzero/pix2pix-zero), [Plug-and-Play Diffusion Features for Text-Driven Image-to-Image Translation](https://github.com/MichalGeyer/plug-and-play), [MaskSketch: Unpaired Structure-guided Masked Image Generation](https://arxiv.org/abs/2302.05496), [SEGA: Instructing Diffusion using Semantic Dimensions](https://arxiv.org/abs/2301.12247), [Universal Guidance for Diffusion Models](https://github.com/arpitbansal297/Universal-Guided-Diffusion), [Region-Aware Diffusion for Zero-shot Text-driven Image Editing](https://github.com/haha-lisa/RDM-Region-Aware-Diffusion-Model), [Domain Expansion of Image Generators](https://arxiv.org/abs/2301.05225), [Image Mixer](https://twitter.com/LambdaAPI/status/1626327289288957956), [MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation](https://multidiffusion.github.io/) - -# Citation - - @misc{zhang2023adding, - title={Adding Conditional Control to Text-to-Image Diffusion Models}, - author={Lvmin Zhang and Anyi Rao and Maneesh Agrawala}, - booktitle={IEEE International Conference on Computer Vision (ICCV)} - year={2023}, - } - -[Arxiv Link](https://arxiv.org/abs/2302.05543) - -[Supplementary Materials](https://lllyasviel.github.io/misc/202309/cnet_supp.pdf) diff --git a/ControlNet/annotator/canny/__init__.py b/ControlNet/annotator/canny/__init__.py deleted file mode 100644 index cb0da951dc838ec9dec2131007e036113281800b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/canny/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import cv2 - - -class CannyDetector: - def __call__(self, img, low_threshold, high_threshold): - return cv2.Canny(img, low_threshold, high_threshold) diff --git a/ControlNet/annotator/ckpts/ckpts.txt b/ControlNet/annotator/ckpts/ckpts.txt deleted file mode 100644 index 
1978551fb2a9226814eaf58459f414fcfac4e69b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/ckpts/ckpts.txt +++ /dev/null @@ -1 +0,0 @@ -Weights here. \ No newline at end of file diff --git a/ControlNet/annotator/hed/__init__.py b/ControlNet/annotator/hed/__init__.py deleted file mode 100644 index a6a8fc712fba02b033dea13bfe33204b8d3c9139..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/hed/__init__.py +++ /dev/null @@ -1,96 +0,0 @@ -# This is an improved version and model of HED edge detection with Apache License, Version 2.0. -# Please use this implementation in your products -# This implementation may produce slightly different results from Saining Xie's official implementations, -# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations. -# Different from official models and other implementations, this is an RGB-input model (rather than BGR) -# and in this way it works better for gradio's RGB protocol - -import os -import cv2 -import torch -import numpy as np - -from einops import rearrange -from annotator.util import annotator_ckpts_path - - -class DoubleConvBlock(torch.nn.Module): - def __init__(self, input_channel, output_channel, layer_number): - super().__init__() - self.convs = torch.nn.Sequential() - self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1)) - for i in range(1, layer_number): - self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1)) - self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0) - - def __call__(self, x, down_sampling=False): - h = x - if down_sampling: - h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2)) - for conv in self.convs: - h = conv(h) - h = torch.nn.functional.relu(h) - return h, self.projection(h) - - -class ControlNetHED_Apache2(torch.nn.Module): - def __init__(self): - super().__init__() - self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1))) - self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2) - self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2) - self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3) - self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3) - self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3) - - def __call__(self, x): - h = x - self.norm - h, projection1 = self.block1(h) - h, projection2 = self.block2(h, down_sampling=True) - h, projection3 = self.block3(h, down_sampling=True) - h, projection4 = self.block4(h, down_sampling=True) - h, projection5 = self.block5(h, down_sampling=True) - return projection1, projection2, projection3, projection4, projection5 - - -class HEDdetector: - def __init__(self): - remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth" - modelpath = os.path.join(annotator_ckpts_path, "ControlNetHED.pth") - if not os.path.exists(modelpath): - from basicsr.utils.download_util import load_file_from_url - load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) - self.netNetwork = ControlNetHED_Apache2().float().cuda().eval() - self.netNetwork.load_state_dict(torch.load(modelpath)) - - def __call__(self, input_image): - assert input_image.ndim == 3 
- H, W, C = input_image.shape - with torch.no_grad(): - image_hed = torch.from_numpy(input_image.copy()).float().cuda() - image_hed = rearrange(image_hed, 'h w c -> 1 c h w') - edges = self.netNetwork(image_hed) - edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges] - edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges] - edges = np.stack(edges, axis=2) - edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64))) - edge = (edge * 255.0).clip(0, 255).astype(np.uint8) - return edge - - -def nms(x, t, s): - x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s) - - f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8) - f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8) - f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8) - f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8) - - y = np.zeros_like(x) - - for f in [f1, f2, f3, f4]: - np.putmask(y, cv2.dilate(x, kernel=f) == x, x) - - z = np.zeros_like(y, dtype=np.uint8) - z[y > t] = 255 - return z diff --git a/ControlNet/annotator/midas/LICENSE b/ControlNet/annotator/midas/LICENSE deleted file mode 100644 index 277b5c11be103f028a8d10985139f1da10c2f08e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/ControlNet/annotator/midas/__init__.py b/ControlNet/annotator/midas/__init__.py deleted file mode 100644 index 36789767f35bcc169c2cbf096e2747539df4f14d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -# Midas Depth Estimation -# From https://github.com/isl-org/MiDaS -# MIT LICENSE - -import cv2 -import numpy as np -import torch - -from einops import rearrange -from .api import MiDaSInference - - -class MidasDetector: - def __init__(self): - self.model = MiDaSInference(model_type="dpt_hybrid").cuda() - - def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): - assert input_image.ndim == 3 - image_depth = input_image - with torch.no_grad(): - image_depth = torch.from_numpy(image_depth).float().cuda() - image_depth = image_depth / 127.5 - 1.0 - image_depth = rearrange(image_depth, 'h w c -> 1 c h w') - depth = self.model(image_depth)[0] - - depth_pt = depth.clone() - depth_pt -= torch.min(depth_pt) - depth_pt /= torch.max(depth_pt) - depth_pt = depth_pt.cpu().numpy() - depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8) - - depth_np = depth.cpu().numpy() - x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3) - y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3) - z = np.ones_like(x) * a - x[depth_pt < bg_th] = 0 - y[depth_pt < bg_th] = 0 - normal = np.stack([x, y, z], axis=2) - normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5 - normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8) - - return depth_image, normal_image diff --git a/ControlNet/annotator/midas/api.py b/ControlNet/annotator/midas/api.py deleted file mode 100644 index 1ab9f15bf96bbaffcee0e3e29fc9d3979d6c32e8..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/api.py +++ /dev/null @@ -1,169 +0,0 @@ -# based on https://github.com/isl-org/MiDaS - -import cv2 -import os -import torch -import torch.nn as nn -from torchvision.transforms import Compose - -from .midas.dpt_depth import DPTDepthModel -from .midas.midas_net import MidasNet -from .midas.midas_net_custom import MidasNet_small -from .midas.transforms import Resize, NormalizeImage, PrepareForNet -from annotator.util import annotator_ckpts_path - - -ISL_PATHS = { - "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"), - "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"), - "midas_v21": "", - "midas_v21_small": "", -} - -remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt" - - -def disabled_train(self, mode=True): - """Overwrite model.train with this function to make sure train/eval mode - does not change anymore.""" - return self - - -def load_midas_transform(model_type): - # https://github.com/isl-org/MiDaS/blob/master/run.py - # load transform only - if model_type == "dpt_large": # DPT-Large - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "dpt_hybrid": # DPT-Hybrid - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "midas_v21": - net_w, net_h = 384, 384 - resize_mode = "upper_bound" - normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - - elif model_type == "midas_v21_small": - net_w, net_h = 256, 256 - resize_mode = "upper_bound" - normalization = NormalizeImage(mean=[0.485, 
0.456, 0.406], std=[0.229, 0.224, 0.225]) - - else: - assert False, f"model_type '{model_type}' not implemented, use: --model_type large" - - transform = Compose( - [ - Resize( - net_w, - net_h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method=resize_mode, - image_interpolation_method=cv2.INTER_CUBIC, - ), - normalization, - PrepareForNet(), - ] - ) - - return transform - - -def load_model(model_type): - # https://github.com/isl-org/MiDaS/blob/master/run.py - # load network - model_path = ISL_PATHS[model_type] - if model_type == "dpt_large": # DPT-Large - model = DPTDepthModel( - path=model_path, - backbone="vitl16_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "dpt_hybrid": # DPT-Hybrid - if not os.path.exists(model_path): - from basicsr.utils.download_util import load_file_from_url - load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) - - model = DPTDepthModel( - path=model_path, - backbone="vitb_rn50_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "midas_v21": - model = MidasNet(model_path, non_negative=True) - net_w, net_h = 384, 384 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - elif model_type == "midas_v21_small": - model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, - non_negative=True, blocks={'expand': True}) - net_w, net_h = 256, 256 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - else: - print(f"model_type '{model_type}' not implemented, use: --model_type large") - assert False - - transform = Compose( - [ - Resize( - net_w, - net_h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method=resize_mode, - image_interpolation_method=cv2.INTER_CUBIC, - ), - normalization, - PrepareForNet(), - ] - ) - - return model.eval(), transform - - -class MiDaSInference(nn.Module): - MODEL_TYPES_TORCH_HUB = [ - "DPT_Large", - "DPT_Hybrid", - "MiDaS_small" - ] - MODEL_TYPES_ISL = [ - "dpt_large", - "dpt_hybrid", - "midas_v21", - "midas_v21_small", - ] - - def __init__(self, model_type): - super().__init__() - assert (model_type in self.MODEL_TYPES_ISL) - model, _ = load_model(model_type) - self.model = model - self.model.train = disabled_train - - def forward(self, x): - with torch.no_grad(): - prediction = self.model(x) - return prediction - diff --git a/ControlNet/annotator/midas/midas/__init__.py b/ControlNet/annotator/midas/midas/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/annotator/midas/midas/base_model.py b/ControlNet/annotator/midas/midas/base_model.py deleted file mode 100644 index 5cf430239b47ec5ec07531263f26f5c24a2311cd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/base_model.py +++ /dev/null @@ -1,16 +0,0 @@ -import torch - - -class BaseModel(torch.nn.Module): - def load(self, path): - """Load model from file. 
- - Args: - path (str): file path - """ - parameters = torch.load(path, map_location=torch.device('cpu')) - - if "optimizer" in parameters: - parameters = parameters["model"] - - self.load_state_dict(parameters) diff --git a/ControlNet/annotator/midas/midas/blocks.py b/ControlNet/annotator/midas/midas/blocks.py deleted file mode 100644 index 2145d18fa98060a618536d9a64fe6589e9be4f78..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/blocks.py +++ /dev/null @@ -1,342 +0,0 @@ -import torch -import torch.nn as nn - -from .vit import ( - _make_pretrained_vitb_rn50_384, - _make_pretrained_vitl16_384, - _make_pretrained_vitb16_384, - forward_vit, -) - -def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",): - if backbone == "vitl16_384": - pretrained = _make_pretrained_vitl16_384( - use_pretrained, hooks=hooks, use_readout=use_readout - ) - scratch = _make_scratch( - [256, 512, 1024, 1024], features, groups=groups, expand=expand - ) # ViT-L/16 - 85.0% Top1 (backbone) - elif backbone == "vitb_rn50_384": - pretrained = _make_pretrained_vitb_rn50_384( - use_pretrained, - hooks=hooks, - use_vit_only=use_vit_only, - use_readout=use_readout, - ) - scratch = _make_scratch( - [256, 512, 768, 768], features, groups=groups, expand=expand - ) # ViT-H/16 - 85.0% Top1 (backbone) - elif backbone == "vitb16_384": - pretrained = _make_pretrained_vitb16_384( - use_pretrained, hooks=hooks, use_readout=use_readout - ) - scratch = _make_scratch( - [96, 192, 384, 768], features, groups=groups, expand=expand - ) # ViT-B/16 - 84.6% Top1 (backbone) - elif backbone == "resnext101_wsl": - pretrained = _make_pretrained_resnext101_wsl(use_pretrained) - scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 - elif backbone == "efficientnet_lite3": - pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) - scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 - else: - print(f"Backbone '{backbone}' not implemented") - assert False - - return pretrained, scratch - - -def _make_scratch(in_shape, out_shape, groups=1, expand=False): - scratch = nn.Module() - - out_shape1 = out_shape - out_shape2 = out_shape - out_shape3 = out_shape - out_shape4 = out_shape - if expand==True: - out_shape1 = out_shape - out_shape2 = out_shape*2 - out_shape3 = out_shape*4 - out_shape4 = out_shape*8 - - scratch.layer1_rn = nn.Conv2d( - in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer2_rn = nn.Conv2d( - in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer3_rn = nn.Conv2d( - in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer4_rn = nn.Conv2d( - in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - - return scratch - - -def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): - efficientnet = torch.hub.load( - "rwightman/gen-efficientnet-pytorch", - "tf_efficientnet_lite3", - pretrained=use_pretrained, - exportable=exportable - ) - return _make_efficientnet_backbone(efficientnet) - - -def _make_efficientnet_backbone(effnet): - pretrained = nn.Module() - - pretrained.layer1 = nn.Sequential( - effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] - ) - 
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) - pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) - pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) - - return pretrained - - -def _make_resnet_backbone(resnet): - pretrained = nn.Module() - pretrained.layer1 = nn.Sequential( - resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 - ) - - pretrained.layer2 = resnet.layer2 - pretrained.layer3 = resnet.layer3 - pretrained.layer4 = resnet.layer4 - - return pretrained - - -def _make_pretrained_resnext101_wsl(use_pretrained): - resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") - return _make_resnet_backbone(resnet) - - - -class Interpolate(nn.Module): - """Interpolation module. - """ - - def __init__(self, scale_factor, mode, align_corners=False): - """Init. - - Args: - scale_factor (float): scaling - mode (str): interpolation mode - """ - super(Interpolate, self).__init__() - - self.interp = nn.functional.interpolate - self.scale_factor = scale_factor - self.mode = mode - self.align_corners = align_corners - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input - - Returns: - tensor: interpolated data - """ - - x = self.interp( - x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners - ) - - return x - - -class ResidualConvUnit(nn.Module): - """Residual convolution module. - """ - - def __init__(self, features): - """Init. - - Args: - features (int): number of features - """ - super().__init__() - - self.conv1 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True - ) - - self.conv2 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True - ) - - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input - - Returns: - tensor: output - """ - out = self.relu(x) - out = self.conv1(out) - out = self.relu(out) - out = self.conv2(out) - - return out + x - - -class FeatureFusionBlock(nn.Module): - """Feature fusion block. - """ - - def __init__(self, features): - """Init. - - Args: - features (int): number of features - """ - super(FeatureFusionBlock, self).__init__() - - self.resConfUnit1 = ResidualConvUnit(features) - self.resConfUnit2 = ResidualConvUnit(features) - - def forward(self, *xs): - """Forward pass. - - Returns: - tensor: output - """ - output = xs[0] - - if len(xs) == 2: - output += self.resConfUnit1(xs[1]) - - output = self.resConfUnit2(output) - - output = nn.functional.interpolate( - output, scale_factor=2, mode="bilinear", align_corners=True - ) - - return output - - - - -class ResidualConvUnit_custom(nn.Module): - """Residual convolution module. - """ - - def __init__(self, features, activation, bn): - """Init. - - Args: - features (int): number of features - """ - super().__init__() - - self.bn = bn - - self.groups=1 - - self.conv1 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups - ) - - self.conv2 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups - ) - - if self.bn==True: - self.bn1 = nn.BatchNorm2d(features) - self.bn2 = nn.BatchNorm2d(features) - - self.activation = activation - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, x): - """Forward pass. 
- - Args: - x (tensor): input - - Returns: - tensor: output - """ - - out = self.activation(x) - out = self.conv1(out) - if self.bn==True: - out = self.bn1(out) - - out = self.activation(out) - out = self.conv2(out) - if self.bn==True: - out = self.bn2(out) - - if self.groups > 1: - out = self.conv_merge(out) - - return self.skip_add.add(out, x) - - # return out + x - - -class FeatureFusionBlock_custom(nn.Module): - """Feature fusion block. - """ - - def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True): - """Init. - - Args: - features (int): number of features - """ - super(FeatureFusionBlock_custom, self).__init__() - - self.deconv = deconv - self.align_corners = align_corners - - self.groups=1 - - self.expand = expand - out_features = features - if self.expand==True: - out_features = features//2 - - self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) - - self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) - self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, *xs): - """Forward pass. - - Returns: - tensor: output - """ - output = xs[0] - - if len(xs) == 2: - res = self.resConfUnit1(xs[1]) - output = self.skip_add.add(output, res) - # output += res - - output = self.resConfUnit2(output) - - output = nn.functional.interpolate( - output, scale_factor=2, mode="bilinear", align_corners=self.align_corners - ) - - output = self.out_conv(output) - - return output - diff --git a/ControlNet/annotator/midas/midas/dpt_depth.py b/ControlNet/annotator/midas/midas/dpt_depth.py deleted file mode 100644 index 4e9aab5d2767dffea39da5b3f30e2798688216f1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/dpt_depth.py +++ /dev/null @@ -1,109 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .base_model import BaseModel -from .blocks import ( - FeatureFusionBlock, - FeatureFusionBlock_custom, - Interpolate, - _make_encoder, - forward_vit, -) - - -def _make_fusion_block(features, use_bn): - return FeatureFusionBlock_custom( - features, - nn.ReLU(False), - deconv=False, - bn=use_bn, - expand=False, - align_corners=True, - ) - - -class DPT(BaseModel): - def __init__( - self, - head, - features=256, - backbone="vitb_rn50_384", - readout="project", - channels_last=False, - use_bn=False, - ): - - super(DPT, self).__init__() - - self.channels_last = channels_last - - hooks = { - "vitb_rn50_384": [0, 1, 8, 11], - "vitb16_384": [2, 5, 8, 11], - "vitl16_384": [5, 11, 17, 23], - } - - # Instantiate backbone and reassemble blocks - self.pretrained, self.scratch = _make_encoder( - backbone, - features, - False, # Set to true of you want to train from scratch, uses ImageNet weights - groups=1, - expand=False, - exportable=False, - hooks=hooks[backbone], - use_readout=readout, - ) - - self.scratch.refinenet1 = _make_fusion_block(features, use_bn) - self.scratch.refinenet2 = _make_fusion_block(features, use_bn) - self.scratch.refinenet3 = _make_fusion_block(features, use_bn) - self.scratch.refinenet4 = _make_fusion_block(features, use_bn) - - self.scratch.output_conv = head - - - def forward(self, x): - if self.channels_last == True: - x.contiguous(memory_format=torch.channels_last) - - layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - 
layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return out - - -class DPTDepthModel(DPT): - def __init__(self, path=None, non_negative=True, **kwargs): - features = kwargs["features"] if "features" in kwargs else 256 - - head = nn.Sequential( - nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), - Interpolate(scale_factor=2, mode="bilinear", align_corners=True), - nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - nn.Identity(), - ) - - super().__init__(head, **kwargs) - - if path is not None: - self.load(path) - - def forward(self, x): - return super().forward(x).squeeze(dim=1) - diff --git a/ControlNet/annotator/midas/midas/midas_net.py b/ControlNet/annotator/midas/midas/midas_net.py deleted file mode 100644 index 8a954977800b0a0f48807e80fa63041910e33c1f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/midas_net.py +++ /dev/null @@ -1,76 +0,0 @@ -"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. -This file contains code that is adapted from -https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py -""" -import torch -import torch.nn as nn - -from .base_model import BaseModel -from .blocks import FeatureFusionBlock, Interpolate, _make_encoder - - -class MidasNet(BaseModel): - """Network for monocular depth estimation. - """ - - def __init__(self, path=None, features=256, non_negative=True): - """Init. - - Args: - path (str, optional): Path to saved model. Defaults to None. - features (int, optional): Number of features. Defaults to 256. - backbone (str, optional): Backbone network for encoder. Defaults to resnet50 - """ - print("Loading weights: ", path) - - super(MidasNet, self).__init__() - - use_pretrained = False if path is None else True - - self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) - - self.scratch.refinenet4 = FeatureFusionBlock(features) - self.scratch.refinenet3 = FeatureFusionBlock(features) - self.scratch.refinenet2 = FeatureFusionBlock(features) - self.scratch.refinenet1 = FeatureFusionBlock(features) - - self.scratch.output_conv = nn.Sequential( - nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), - Interpolate(scale_factor=2, mode="bilinear"), - nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - ) - - if path: - self.load(path) - - def forward(self, x): - """Forward pass. 
- - Args: - x (tensor): input data (image) - - Returns: - tensor: depth - """ - - layer_1 = self.pretrained.layer1(x) - layer_2 = self.pretrained.layer2(layer_1) - layer_3 = self.pretrained.layer3(layer_2) - layer_4 = self.pretrained.layer4(layer_3) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return torch.squeeze(out, dim=1) diff --git a/ControlNet/annotator/midas/midas/midas_net_custom.py b/ControlNet/annotator/midas/midas/midas_net_custom.py deleted file mode 100644 index 50e4acb5e53d5fabefe3dde16ab49c33c2b7797c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/midas_net_custom.py +++ /dev/null @@ -1,128 +0,0 @@ -"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. -This file contains code that is adapted from -https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py -""" -import torch -import torch.nn as nn - -from .base_model import BaseModel -from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder - - -class MidasNet_small(BaseModel): - """Network for monocular depth estimation. - """ - - def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, - blocks={'expand': True}): - """Init. - - Args: - path (str, optional): Path to saved model. Defaults to None. - features (int, optional): Number of features. Defaults to 256. - backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 - """ - print("Loading weights: ", path) - - super(MidasNet_small, self).__init__() - - use_pretrained = False if path else True - - self.channels_last = channels_last - self.blocks = blocks - self.backbone = backbone - - self.groups = 1 - - features1=features - features2=features - features3=features - features4=features - self.expand = False - if "expand" in self.blocks and self.blocks['expand'] == True: - self.expand = True - features1=features - features2=features*2 - features3=features*4 - features4=features*8 - - self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) - - self.scratch.activation = nn.ReLU(False) - - self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) - - - self.scratch.output_conv = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), - Interpolate(scale_factor=2, mode="bilinear"), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), - self.scratch.activation, - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - nn.Identity(), - ) - - if path: - self.load(path) - - - def forward(self, x): - """Forward pass. 
- - Args: - x (tensor): input data (image) - - Returns: - tensor: depth - """ - if self.channels_last==True: - print("self.channels_last = ", self.channels_last) - x.contiguous(memory_format=torch.channels_last) - - - layer_1 = self.pretrained.layer1(x) - layer_2 = self.pretrained.layer2(layer_1) - layer_3 = self.pretrained.layer3(layer_2) - layer_4 = self.pretrained.layer4(layer_3) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return torch.squeeze(out, dim=1) - - - -def fuse_model(m): - prev_previous_type = nn.Identity() - prev_previous_name = '' - previous_type = nn.Identity() - previous_name = '' - for name, module in m.named_modules(): - if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: - # print("FUSED ", prev_previous_name, previous_name, name) - torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) - elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: - # print("FUSED ", prev_previous_name, previous_name) - torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) - # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: - # print("FUSED ", previous_name, name) - # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) - - prev_previous_type = previous_type - prev_previous_name = previous_name - previous_type = type(module) - previous_name = name \ No newline at end of file diff --git a/ControlNet/annotator/midas/midas/vit.py b/ControlNet/annotator/midas/midas/vit.py deleted file mode 100644 index ea46b1be88b261b0dec04f3da0256f5f66f88a74..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/midas/vit.py +++ /dev/null @@ -1,491 +0,0 @@ -import torch -import torch.nn as nn -import timm -import types -import math -import torch.nn.functional as F - - -class Slice(nn.Module): - def __init__(self, start_index=1): - super(Slice, self).__init__() - self.start_index = start_index - - def forward(self, x): - return x[:, self.start_index :] - - -class AddReadout(nn.Module): - def __init__(self, start_index=1): - super(AddReadout, self).__init__() - self.start_index = start_index - - def forward(self, x): - if self.start_index == 2: - readout = (x[:, 0] + x[:, 1]) / 2 - else: - readout = x[:, 0] - return x[:, self.start_index :] + readout.unsqueeze(1) - - -class ProjectReadout(nn.Module): - def __init__(self, in_features, start_index=1): - super(ProjectReadout, self).__init__() - self.start_index = start_index - - self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) - - def forward(self, x): - readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :]) - features = torch.cat((x[:, self.start_index :], readout), -1) - - return self.project(features) - - -class Transpose(nn.Module): - def __init__(self, dim0, dim1): - super(Transpose, self).__init__() - self.dim0 = dim0 - self.dim1 = dim1 - - def forward(self, x): - x = x.transpose(self.dim0, self.dim1) - return x - - -def forward_vit(pretrained, x): - b, c, h, w = x.shape - - glob = pretrained.model.forward_flex(x) - - layer_1 = 
pretrained.activations["1"] - layer_2 = pretrained.activations["2"] - layer_3 = pretrained.activations["3"] - layer_4 = pretrained.activations["4"] - - layer_1 = pretrained.act_postprocess1[0:2](layer_1) - layer_2 = pretrained.act_postprocess2[0:2](layer_2) - layer_3 = pretrained.act_postprocess3[0:2](layer_3) - layer_4 = pretrained.act_postprocess4[0:2](layer_4) - - unflatten = nn.Sequential( - nn.Unflatten( - 2, - torch.Size( - [ - h // pretrained.model.patch_size[1], - w // pretrained.model.patch_size[0], - ] - ), - ) - ) - - if layer_1.ndim == 3: - layer_1 = unflatten(layer_1) - if layer_2.ndim == 3: - layer_2 = unflatten(layer_2) - if layer_3.ndim == 3: - layer_3 = unflatten(layer_3) - if layer_4.ndim == 3: - layer_4 = unflatten(layer_4) - - layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) - layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) - layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) - layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) - - return layer_1, layer_2, layer_3, layer_4 - - -def _resize_pos_embed(self, posemb, gs_h, gs_w): - posemb_tok, posemb_grid = ( - posemb[:, : self.start_index], - posemb[0, self.start_index :], - ) - - gs_old = int(math.sqrt(len(posemb_grid))) - - posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) - posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") - posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) - - posemb = torch.cat([posemb_tok, posemb_grid], dim=1) - - return posemb - - -def forward_flex(self, x): - b, c, h, w = x.shape - - pos_embed = self._resize_pos_embed( - self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] - ) - - B = x.shape[0] - - if hasattr(self.patch_embed, "backbone"): - x = self.patch_embed.backbone(x) - if isinstance(x, (list, tuple)): - x = x[-1] # last feature if backbone outputs list/tuple of features - - x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) - - if getattr(self, "dist_token", None) is not None: - cls_tokens = self.cls_token.expand( - B, -1, -1 - ) # stole cls_tokens impl from Phil Wang, thanks - dist_token = self.dist_token.expand(B, -1, -1) - x = torch.cat((cls_tokens, dist_token, x), dim=1) - else: - cls_tokens = self.cls_token.expand( - B, -1, -1 - ) # stole cls_tokens impl from Phil Wang, thanks - x = torch.cat((cls_tokens, x), dim=1) - - x = x + pos_embed - x = self.pos_drop(x) - - for blk in self.blocks: - x = blk(x) - - x = self.norm(x) - - return x - - -activations = {} - - -def get_activation(name): - def hook(model, input, output): - activations[name] = output - - return hook - - -def get_readout_oper(vit_features, features, use_readout, start_index=1): - if use_readout == "ignore": - readout_oper = [Slice(start_index)] * len(features) - elif use_readout == "add": - readout_oper = [AddReadout(start_index)] * len(features) - elif use_readout == "project": - readout_oper = [ - ProjectReadout(vit_features, start_index) for out_feat in features - ] - else: - assert ( - False - ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" - - return readout_oper - - -def _make_vit_b16_backbone( - model, - features=[96, 192, 384, 768], - size=[384, 384], - hooks=[2, 5, 8, 11], - vit_features=768, - use_readout="ignore", - start_index=1, -): - pretrained = nn.Module() - - pretrained.model = model - 
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) - pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) - pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) - pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) - - pretrained.activations = activations - - readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) - - # 32, 48, 136, 384 - pretrained.act_postprocess1 = nn.Sequential( - readout_oper[0], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[0], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[0], - out_channels=features[0], - kernel_size=4, - stride=4, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess2 = nn.Sequential( - readout_oper[1], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[1], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[1], - out_channels=features[1], - kernel_size=2, - stride=2, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess3 = nn.Sequential( - readout_oper[2], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[2], - kernel_size=1, - stride=1, - padding=0, - ), - ) - - pretrained.act_postprocess4 = nn.Sequential( - readout_oper[3], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[3], - kernel_size=1, - stride=1, - padding=0, - ), - nn.Conv2d( - in_channels=features[3], - out_channels=features[3], - kernel_size=3, - stride=2, - padding=1, - ), - ) - - pretrained.model.start_index = start_index - pretrained.model.patch_size = [16, 16] - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. 
- pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) - pretrained.model._resize_pos_embed = types.MethodType( - _resize_pos_embed, pretrained.model - ) - - return pretrained - - -def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) - - hooks = [5, 11, 17, 23] if hooks == None else hooks - return _make_vit_b16_backbone( - model, - features=[256, 512, 1024, 1024], - hooks=hooks, - vit_features=1024, - use_readout=use_readout, - ) - - -def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout - ) - - -def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout - ) - - -def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model( - "vit_deit_base_distilled_patch16_384", pretrained=pretrained - ) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, - features=[96, 192, 384, 768], - hooks=hooks, - use_readout=use_readout, - start_index=2, - ) - - -def _make_vit_b_rn50_backbone( - model, - features=[256, 512, 768, 768], - size=[384, 384], - hooks=[0, 1, 8, 11], - vit_features=768, - use_vit_only=False, - use_readout="ignore", - start_index=1, -): - pretrained = nn.Module() - - pretrained.model = model - - if use_vit_only == True: - pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) - pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) - else: - pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( - get_activation("1") - ) - pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( - get_activation("2") - ) - - pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) - pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) - - pretrained.activations = activations - - readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) - - if use_vit_only == True: - pretrained.act_postprocess1 = nn.Sequential( - readout_oper[0], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[0], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[0], - out_channels=features[0], - kernel_size=4, - stride=4, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess2 = nn.Sequential( - readout_oper[1], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[1], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[1], - out_channels=features[1], - kernel_size=2, - stride=2, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - else: - pretrained.act_postprocess1 = nn.Sequential( - nn.Identity(), nn.Identity(), nn.Identity() - ) - 
pretrained.act_postprocess2 = nn.Sequential( - nn.Identity(), nn.Identity(), nn.Identity() - ) - - pretrained.act_postprocess3 = nn.Sequential( - readout_oper[2], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[2], - kernel_size=1, - stride=1, - padding=0, - ), - ) - - pretrained.act_postprocess4 = nn.Sequential( - readout_oper[3], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[3], - kernel_size=1, - stride=1, - padding=0, - ), - nn.Conv2d( - in_channels=features[3], - out_channels=features[3], - kernel_size=3, - stride=2, - padding=1, - ), - ) - - pretrained.model.start_index = start_index - pretrained.model.patch_size = [16, 16] - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. - pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. - pretrained.model._resize_pos_embed = types.MethodType( - _resize_pos_embed, pretrained.model - ) - - return pretrained - - -def _make_pretrained_vitb_rn50_384( - pretrained, use_readout="ignore", hooks=None, use_vit_only=False -): - model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) - - hooks = [0, 1, 8, 11] if hooks == None else hooks - return _make_vit_b_rn50_backbone( - model, - features=[256, 512, 768, 768], - size=[384, 384], - hooks=hooks, - use_vit_only=use_vit_only, - use_readout=use_readout, - ) diff --git a/ControlNet/annotator/midas/utils.py b/ControlNet/annotator/midas/utils.py deleted file mode 100644 index 9a9d3b5b66370fa98da9e067ba53ead848ea9a59..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/midas/utils.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Utils for monoDepth.""" -import sys -import re -import numpy as np -import cv2 -import torch - - -def read_pfm(path): - """Read pfm file. - - Args: - path (str): path to file - - Returns: - tuple: (data, scale) - """ - with open(path, "rb") as file: - - color = None - width = None - height = None - scale = None - endian = None - - header = file.readline().rstrip() - if header.decode("ascii") == "PF": - color = True - elif header.decode("ascii") == "Pf": - color = False - else: - raise Exception("Not a PFM file: " + path) - - dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) - if dim_match: - width, height = list(map(int, dim_match.groups())) - else: - raise Exception("Malformed PFM header.") - - scale = float(file.readline().decode("ascii").rstrip()) - if scale < 0: - # little-endian - endian = "<" - scale = -scale - else: - # big-endian - endian = ">" - - data = np.fromfile(file, endian + "f") - shape = (height, width, 3) if color else (height, width) - - data = np.reshape(data, shape) - data = np.flipud(data) - - return data, scale - - -def write_pfm(path, image, scale=1): - """Write pfm file. - - Args: - path (str): pathto file - image (array): data - scale (int, optional): Scale. Defaults to 1. 
- """ - - with open(path, "wb") as file: - color = None - - if image.dtype.name != "float32": - raise Exception("Image dtype must be float32.") - - image = np.flipud(image) - - if len(image.shape) == 3 and image.shape[2] == 3: # color image - color = True - elif ( - len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 - ): # greyscale - color = False - else: - raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") - - file.write("PF\n" if color else "Pf\n".encode()) - file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) - - endian = image.dtype.byteorder - - if endian == "<" or endian == "=" and sys.byteorder == "little": - scale = -scale - - file.write("%f\n".encode() % scale) - - image.tofile(file) - - -def read_image(path): - """Read image and output RGB image (0-1). - - Args: - path (str): path to file - - Returns: - array: RGB image (0-1) - """ - img = cv2.imread(path) - - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 - - return img - - -def resize_image(img): - """Resize image and make it fit for network. - - Args: - img (array): image - - Returns: - tensor: data ready for network - """ - height_orig = img.shape[0] - width_orig = img.shape[1] - - if width_orig > height_orig: - scale = width_orig / 384 - else: - scale = height_orig / 384 - - height = (np.ceil(height_orig / scale / 32) * 32).astype(int) - width = (np.ceil(width_orig / scale / 32) * 32).astype(int) - - img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) - - img_resized = ( - torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() - ) - img_resized = img_resized.unsqueeze(0) - - return img_resized - - -def resize_depth(depth, width, height): - """Resize depth map and bring to CPU (numpy). - - Args: - depth (tensor): depth - width (int): image width - height (int): image height - - Returns: - array: processed depth - """ - depth = torch.squeeze(depth[0, :, :, :]).to("cpu") - - depth_resized = cv2.resize( - depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC - ) - - return depth_resized - -def write_depth(path, depth, bits=1): - """Write depth map to pfm and png file. - - Args: - path (str): filepath without extension - depth (array): depth - """ - write_pfm(path + ".pfm", depth.astype(np.float32)) - - depth_min = depth.min() - depth_max = depth.max() - - max_val = (2**(8*bits))-1 - - if depth_max - depth_min > np.finfo("float").eps: - out = max_val * (depth - depth_min) / (depth_max - depth_min) - else: - out = np.zeros(depth.shape, dtype=depth.type) - - if bits == 1: - cv2.imwrite(path + ".png", out.astype("uint8")) - elif bits == 2: - cv2.imwrite(path + ".png", out.astype("uint16")) - - return diff --git a/ControlNet/annotator/mlsd/LICENSE b/ControlNet/annotator/mlsd/LICENSE deleted file mode 100644 index d855c6db44b4e873eedd750d34fa2eaf22e22363..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/mlsd/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2021-present NAVER Corp. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/ControlNet/annotator/mlsd/__init__.py b/ControlNet/annotator/mlsd/__init__.py deleted file mode 100644 index c1860702df6150c5a93c9bb6bf34906a77048c7c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/mlsd/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# MLSD Line Detection -# From https://github.com/navervision/mlsd -# Apache-2.0 license - -import cv2 -import numpy as np -import torch -import os - -from einops import rearrange -from .models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny -from .models.mbv2_mlsd_large import MobileV2_MLSD_Large -from .utils import pred_lines - -from annotator.util import annotator_ckpts_path - - -remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth" - - -class MLSDdetector: - def __init__(self): - model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pth") - if not os.path.exists(model_path): - from basicsr.utils.download_util import load_file_from_url - load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) - model = MobileV2_MLSD_Large() - model.load_state_dict(torch.load(model_path), strict=True) - self.model = model.cuda().eval() - - def __call__(self, input_image, thr_v, thr_d): - assert input_image.ndim == 3 - img = input_image - img_output = np.zeros_like(img) - try: - with torch.no_grad(): - lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d) - for line in lines: - x_start, y_start, x_end, y_end = [int(val) for val in line] - cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1) - except Exception as e: - pass - return img_output[:, :, 0] diff --git a/ControlNet/annotator/mlsd/models/mbv2_mlsd_large.py b/ControlNet/annotator/mlsd/models/mbv2_mlsd_large.py deleted file mode 100644 index 5b9799e7573ca41549b3c3b13ac47b906b369603..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/mlsd/models/mbv2_mlsd_large.py +++ /dev/null @@ -1,292 +0,0 @@ -import os -import sys -import torch -import torch.nn as nn -import torch.utils.model_zoo as model_zoo -from torch.nn import functional as F - - -class BlockTypeA(nn.Module): - def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True): - super(BlockTypeA, self).__init__() - self.conv1 = nn.Sequential( - nn.Conv2d(in_c2, out_c2, kernel_size=1), - nn.BatchNorm2d(out_c2), - nn.ReLU(inplace=True) - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c1, out_c1, kernel_size=1), - nn.BatchNorm2d(out_c1), - nn.ReLU(inplace=True) - ) - self.upscale = upscale - - def forward(self, a, b): - b = self.conv1(b) - a = self.conv2(a) - if self.upscale: - b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True) - return torch.cat((a, b), dim=1) - - -class BlockTypeB(nn.Module): - def __init__(self, in_c, out_c): - super(BlockTypeB, self).__init__() - self.conv1 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), - nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c, out_c, kernel_size=3, padding=1), - nn.BatchNorm2d(out_c), - nn.ReLU() - ) - - def forward(self, x): - x = self.conv1(x) + x - x = self.conv2(x) - return x - -class BlockTypeC(nn.Module): - def __init__(self, in_c, out_c): - super(BlockTypeC, self).__init__() - self.conv1 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5), - nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), - 
nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - return x - -def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. - It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNReLU(nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - self.channel_pad = out_planes - in_planes - self.stride = stride - #padding = (kernel_size - 1) // 2 - - # TFLite uses slightly different padding than PyTorch - if stride == 2: - padding = 0 - else: - padding = (kernel_size - 1) // 2 - - super(ConvBNReLU, self).__init__( - nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), - nn.BatchNorm2d(out_planes), - nn.ReLU6(inplace=True) - ) - self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) - - - def forward(self, x): - # TFLite uses different padding - if self.stride == 2: - x = F.pad(x, (0, 1, 0, 1), "constant", 0) - #print(x.shape) - - for module in self: - if not isinstance(module, nn.MaxPool2d): - x = module(x) - return x - - -class InvertedResidual(nn.Module): - def __init__(self, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2] - - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = self.stride == 1 and inp == oup - - layers = [] - if expand_ratio != 1: - # pw - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - # dw - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - # pw-linear - nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup), - ]) - self.conv = nn.Sequential(*layers) - - def forward(self, x): - if self.use_res_connect: - return x + self.conv(x) - else: - return self.conv(x) - - -class MobileNetV2(nn.Module): - def __init__(self, pretrained=True): - """ - MobileNet V2 main class - Args: - num_classes (int): Number of classes - width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount - inverted_residual_setting: Network structure - round_nearest (int): Round the number of channels in each layer to be a multiple of this number - Set to 1 to turn off rounding - block: Module specifying inverted residual building block for mobilenet - """ - super(MobileNetV2, self).__init__() - - block = InvertedResidual - input_channel = 32 - last_channel = 1280 - width_mult = 1.0 - round_nearest = 8 - - inverted_residual_setting = [ - # t, c, n, s - [1, 16, 1, 1], - [6, 24, 2, 2], - [6, 32, 3, 2], - [6, 64, 4, 2], - [6, 96, 3, 1], - #[6, 160, 3, 2], - #[6, 320, 1, 1], - ] - - # only check the first element, assuming user knows t,c,n,s are required - if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: - raise ValueError("inverted_residual_setting should be non-empty " - "or a 4-element list, got {}".format(inverted_residual_setting)) - - # building first layer - input_channel = 
_make_divisible(input_channel * width_mult, round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(4, input_channel, stride=2)] - # building inverted residual blocks - for t, c, n, s in inverted_residual_setting: - output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - - self.features = nn.Sequential(*features) - self.fpn_selected = [1, 3, 6, 10, 13] - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out') - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.zeros_(m.bias) - if pretrained: - self._load_pretrained_model() - - def _forward_impl(self, x): - # This exists since TorchScript doesn't support inheritance, so the superclass method - # (this one) needs to have a name other than `forward` that can be accessed in a subclass - fpn_features = [] - for i, f in enumerate(self.features): - if i > self.fpn_selected[-1]: - break - x = f(x) - if i in self.fpn_selected: - fpn_features.append(x) - - c1, c2, c3, c4, c5 = fpn_features - return c1, c2, c3, c4, c5 - - - def forward(self, x): - return self._forward_impl(x) - - def _load_pretrained_model(self): - pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth') - model_dict = {} - state_dict = self.state_dict() - for k, v in pretrain_dict.items(): - if k in state_dict: - model_dict[k] = v - state_dict.update(model_dict) - self.load_state_dict(state_dict) - - -class MobileV2_MLSD_Large(nn.Module): - def __init__(self): - super(MobileV2_MLSD_Large, self).__init__() - - self.backbone = MobileNetV2(pretrained=False) - ## A, B - self.block15 = BlockTypeA(in_c1= 64, in_c2= 96, - out_c1= 64, out_c2=64, - upscale=False) - self.block16 = BlockTypeB(128, 64) - - ## A, B - self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64, - out_c1= 64, out_c2= 64) - self.block18 = BlockTypeB(128, 64) - - ## A, B - self.block19 = BlockTypeA(in_c1=24, in_c2=64, - out_c1=64, out_c2=64) - self.block20 = BlockTypeB(128, 64) - - ## A, B, C - self.block21 = BlockTypeA(in_c1=16, in_c2=64, - out_c1=64, out_c2=64) - self.block22 = BlockTypeB(128, 64) - - self.block23 = BlockTypeC(64, 16) - - def forward(self, x): - c1, c2, c3, c4, c5 = self.backbone(x) - - x = self.block15(c4, c5) - x = self.block16(x) - - x = self.block17(c3, x) - x = self.block18(x) - - x = self.block19(c2, x) - x = self.block20(x) - - x = self.block21(c1, x) - x = self.block22(x) - x = self.block23(x) - x = x[:, 7:, :, :] - - return x \ No newline at end of file diff --git a/ControlNet/annotator/mlsd/models/mbv2_mlsd_tiny.py b/ControlNet/annotator/mlsd/models/mbv2_mlsd_tiny.py deleted file mode 100644 index e3ed633f2cc23ea1829a627fdb879ab39f641f83..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/mlsd/models/mbv2_mlsd_tiny.py +++ /dev/null @@ -1,275 +0,0 @@ -import os -import sys -import torch -import torch.nn as nn -import torch.utils.model_zoo as model_zoo -from torch.nn import functional as F - - -class BlockTypeA(nn.Module): - def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True): - super(BlockTypeA, self).__init__() - self.conv1 = nn.Sequential( - 
nn.Conv2d(in_c2, out_c2, kernel_size=1), - nn.BatchNorm2d(out_c2), - nn.ReLU(inplace=True) - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c1, out_c1, kernel_size=1), - nn.BatchNorm2d(out_c1), - nn.ReLU(inplace=True) - ) - self.upscale = upscale - - def forward(self, a, b): - b = self.conv1(b) - a = self.conv2(a) - b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True) - return torch.cat((a, b), dim=1) - - -class BlockTypeB(nn.Module): - def __init__(self, in_c, out_c): - super(BlockTypeB, self).__init__() - self.conv1 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), - nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c, out_c, kernel_size=3, padding=1), - nn.BatchNorm2d(out_c), - nn.ReLU() - ) - - def forward(self, x): - x = self.conv1(x) + x - x = self.conv2(x) - return x - -class BlockTypeC(nn.Module): - def __init__(self, in_c, out_c): - super(BlockTypeC, self).__init__() - self.conv1 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5), - nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv2 = nn.Sequential( - nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), - nn.BatchNorm2d(in_c), - nn.ReLU() - ) - self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - return x - -def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. - It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
- if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNReLU(nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - self.channel_pad = out_planes - in_planes - self.stride = stride - #padding = (kernel_size - 1) // 2 - - # TFLite uses slightly different padding than PyTorch - if stride == 2: - padding = 0 - else: - padding = (kernel_size - 1) // 2 - - super(ConvBNReLU, self).__init__( - nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), - nn.BatchNorm2d(out_planes), - nn.ReLU6(inplace=True) - ) - self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) - - - def forward(self, x): - # TFLite uses different padding - if self.stride == 2: - x = F.pad(x, (0, 1, 0, 1), "constant", 0) - #print(x.shape) - - for module in self: - if not isinstance(module, nn.MaxPool2d): - x = module(x) - return x - - -class InvertedResidual(nn.Module): - def __init__(self, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2] - - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = self.stride == 1 and inp == oup - - layers = [] - if expand_ratio != 1: - # pw - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - # dw - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - # pw-linear - nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup), - ]) - self.conv = nn.Sequential(*layers) - - def forward(self, x): - if self.use_res_connect: - return x + self.conv(x) - else: - return self.conv(x) - - -class MobileNetV2(nn.Module): - def __init__(self, pretrained=True): - """ - MobileNet V2 main class - Args: - num_classes (int): Number of classes - width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount - inverted_residual_setting: Network structure - round_nearest (int): Round the number of channels in each layer to be a multiple of this number - Set to 1 to turn off rounding - block: Module specifying inverted residual building block for mobilenet - """ - super(MobileNetV2, self).__init__() - - block = InvertedResidual - input_channel = 32 - last_channel = 1280 - width_mult = 1.0 - round_nearest = 8 - - inverted_residual_setting = [ - # t, c, n, s - [1, 16, 1, 1], - [6, 24, 2, 2], - [6, 32, 3, 2], - [6, 64, 4, 2], - #[6, 96, 3, 1], - #[6, 160, 3, 2], - #[6, 320, 1, 1], - ] - - # only check the first element, assuming user knows t,c,n,s are required - if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: - raise ValueError("inverted_residual_setting should be non-empty " - "or a 4-element list, got {}".format(inverted_residual_setting)) - - # building first layer - input_channel = _make_divisible(input_channel * width_mult, round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(4, input_channel, stride=2)] - # building inverted residual blocks - for t, c, n, s in inverted_residual_setting: - output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - self.features = nn.Sequential(*features) - - self.fpn_selected = [3, 6, 10] - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out') - if 
m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.zeros_(m.bias) - - #if pretrained: - # self._load_pretrained_model() - - def _forward_impl(self, x): - # This exists since TorchScript doesn't support inheritance, so the superclass method - # (this one) needs to have a name other than `forward` that can be accessed in a subclass - fpn_features = [] - for i, f in enumerate(self.features): - if i > self.fpn_selected[-1]: - break - x = f(x) - if i in self.fpn_selected: - fpn_features.append(x) - - c2, c3, c4 = fpn_features - return c2, c3, c4 - - - def forward(self, x): - return self._forward_impl(x) - - def _load_pretrained_model(self): - pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth') - model_dict = {} - state_dict = self.state_dict() - for k, v in pretrain_dict.items(): - if k in state_dict: - model_dict[k] = v - state_dict.update(model_dict) - self.load_state_dict(state_dict) - - -class MobileV2_MLSD_Tiny(nn.Module): - def __init__(self): - super(MobileV2_MLSD_Tiny, self).__init__() - - self.backbone = MobileNetV2(pretrained=True) - - self.block12 = BlockTypeA(in_c1= 32, in_c2= 64, - out_c1= 64, out_c2=64) - self.block13 = BlockTypeB(128, 64) - - self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64, - out_c1= 32, out_c2= 32) - self.block15 = BlockTypeB(64, 64) - - self.block16 = BlockTypeC(64, 16) - - def forward(self, x): - c2, c3, c4 = self.backbone(x) - - x = self.block12(c3, c4) - x = self.block13(x) - x = self.block14(c2, x) - x = self.block15(x) - x = self.block16(x) - x = x[:, 7:, :, :] - #print(x.shape) - x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True) - - return x \ No newline at end of file diff --git a/ControlNet/annotator/mlsd/utils.py b/ControlNet/annotator/mlsd/utils.py deleted file mode 100644 index ae3cf9420a33a4abae27c48ac4b90938c7d63cc3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/mlsd/utils.py +++ /dev/null @@ -1,580 +0,0 @@ -''' -modified by lihaoweicv -pytorch version -''' - -''' -M-LSD -Copyright 2021-present NAVER Corp. 
-Apache License v2.0 -''' - -import os -import numpy as np -import cv2 -import torch -from torch.nn import functional as F - - -def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5): - ''' - tpMap: - center: tpMap[1, 0, :, :] - displacement: tpMap[1, 1:5, :, :] - ''' - b, c, h, w = tpMap.shape - assert b==1, 'only support bsize==1' - displacement = tpMap[:, 1:5, :, :][0] - center = tpMap[:, 0, :, :] - heat = torch.sigmoid(center) - hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2) - keep = (hmax == heat).float() - heat = heat * keep - heat = heat.reshape(-1, ) - - scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True) - yy = torch.floor_divide(indices, w).unsqueeze(-1) - xx = torch.fmod(indices, w).unsqueeze(-1) - ptss = torch.cat((yy, xx),dim=-1) - - ptss = ptss.detach().cpu().numpy() - scores = scores.detach().cpu().numpy() - displacement = displacement.detach().cpu().numpy() - displacement = displacement.transpose((1,2,0)) - return ptss, scores, displacement - - -def pred_lines(image, model, - input_shape=[512, 512], - score_thr=0.10, - dist_thr=20.0): - h, w, _ = image.shape - h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]] - - resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA), - np.ones([input_shape[0], input_shape[1], 1])], axis=-1) - - resized_image = resized_image.transpose((2,0,1)) - batch_image = np.expand_dims(resized_image, axis=0).astype('float32') - batch_image = (batch_image / 127.5) - 1.0 - - batch_image = torch.from_numpy(batch_image).float().cuda() - outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) - start = vmap[:, :, :2] - end = vmap[:, :, 2:] - dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) - - segments_list = [] - for center, score in zip(pts, pts_score): - y, x = center - distance = dist_map[y, x] - if score > score_thr and distance > dist_thr: - disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :] - x_start = x + disp_x_start - y_start = y + disp_y_start - x_end = x + disp_x_end - y_end = y + disp_y_end - segments_list.append([x_start, y_start, x_end, y_end]) - - lines = 2 * np.array(segments_list) # 256 > 512 - lines[:, 0] = lines[:, 0] * w_ratio - lines[:, 1] = lines[:, 1] * h_ratio - lines[:, 2] = lines[:, 2] * w_ratio - lines[:, 3] = lines[:, 3] * h_ratio - - return lines - - -def pred_squares(image, - model, - input_shape=[512, 512], - params={'score': 0.06, - 'outside_ratio': 0.28, - 'inside_ratio': 0.45, - 'w_overlap': 0.0, - 'w_degree': 1.95, - 'w_length': 0.0, - 'w_area': 1.86, - 'w_center': 0.14}): - ''' - shape = [height, width] - ''' - h, w, _ = image.shape - original_shape = [h, w] - - resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA), - np.ones([input_shape[0], input_shape[1], 1])], axis=-1) - resized_image = resized_image.transpose((2, 0, 1)) - batch_image = np.expand_dims(resized_image, axis=0).astype('float32') - batch_image = (batch_image / 127.5) - 1.0 - - batch_image = torch.from_numpy(batch_image).float().cuda() - outputs = model(batch_image) - - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) - start = vmap[:, :, :2] # (x, y) - end = vmap[:, :, 2:] # (x, y) - dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) - - junc_list = [] - segments_list = [] - for junc, score in zip(pts, pts_score): - y, x = junc - distance = dist_map[y, x] - if score > params['score'] 
and distance > 20.0: - junc_list.append([x, y]) - disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :] - d_arrow = 1.0 - x_start = x + d_arrow * disp_x_start - y_start = y + d_arrow * disp_y_start - x_end = x + d_arrow * disp_x_end - y_end = y + d_arrow * disp_y_end - segments_list.append([x_start, y_start, x_end, y_end]) - - segments = np.array(segments_list) - - ####### post processing for squares - # 1. get unique lines - point = np.array([[0, 0]]) - point = point[0] - start = segments[:, :2] - end = segments[:, 2:] - diff = start - end - a = diff[:, 1] - b = -diff[:, 0] - c = a * start[:, 0] + b * start[:, 1] - - d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10) - theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi - theta[theta < 0.0] += 180 - hough = np.concatenate([d[:, None], theta[:, None]], axis=-1) - - d_quant = 1 - theta_quant = 2 - hough[:, 0] //= d_quant - hough[:, 1] //= theta_quant - _, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True) - - acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32') - idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1 - yx_indices = hough[indices, :].astype('int32') - acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts - idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices - - acc_map_np = acc_map - # acc_map = acc_map[None, :, :, None] - # - # ### fast suppression using tensorflow op - # acc_map = tf.constant(acc_map, dtype=tf.float32) - # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map) - # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32) - # flatten_acc_map = tf.reshape(acc_map, [1, -1]) - # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts)) - # _, h, w, _ = acc_map.shape - # y = tf.expand_dims(topk_indices // w, axis=-1) - # x = tf.expand_dims(topk_indices % w, axis=-1) - # yx = tf.concat([y, x], axis=-1) - - ### fast suppression using pytorch op - acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0) - _,_, h, w = acc_map.shape - max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2) - acc_map = acc_map * ( (acc_map == max_acc_map).float() ) - flatten_acc_map = acc_map.reshape([-1, ]) - - scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True) - yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1) - xx = torch.fmod(indices, w).unsqueeze(-1) - yx = torch.cat((yy, xx), dim=-1) - - yx = yx.detach().cpu().numpy() - - topk_values = scores.detach().cpu().numpy() - indices = idx_map[yx[:, 0], yx[:, 1]] - basis = 5 // 2 - - merged_segments = [] - for yx_pt, max_indice, value in zip(yx, indices, topk_values): - y, x = yx_pt - if max_indice == -1 or value == 0: - continue - segment_list = [] - for y_offset in range(-basis, basis + 1): - for x_offset in range(-basis, basis + 1): - indice = idx_map[y + y_offset, x + x_offset] - cnt = int(acc_map_np[y + y_offset, x + x_offset]) - if indice != -1: - segment_list.append(segments[indice]) - if cnt > 1: - check_cnt = 1 - current_hough = hough[indice] - for new_indice, new_hough in enumerate(hough): - if (current_hough == new_hough).all() and indice != new_indice: - segment_list.append(segments[new_indice]) - check_cnt += 1 - if check_cnt == cnt: - break - group_segments = np.array(segment_list).reshape([-1, 2]) - sorted_group_segments = np.sort(group_segments, axis=0) - x_min, y_min = sorted_group_segments[0, :] - 
x_max, y_max = sorted_group_segments[-1, :] - - deg = theta[max_indice] - if deg >= 90: - merged_segments.append([x_min, y_max, x_max, y_min]) - else: - merged_segments.append([x_min, y_min, x_max, y_max]) - - # 2. get intersections - new_segments = np.array(merged_segments) # (x1, y1, x2, y2) - start = new_segments[:, :2] # (x1, y1) - end = new_segments[:, 2:] # (x2, y2) - new_centers = (start + end) / 2.0 - diff = start - end - dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1)) - - # ax + by = c - a = diff[:, 1] - b = -diff[:, 0] - c = a * start[:, 0] + b * start[:, 1] - pre_det = a[:, None] * b[None, :] - det = pre_det - np.transpose(pre_det) - - pre_inter_y = a[:, None] * c[None, :] - inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10) - pre_inter_x = c[:, None] * b[None, :] - inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10) - inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32') - - # 3. get corner information - # 3.1 get distance - ''' - dist_segments: - | dist(0), dist(1), dist(2), ...| - dist_inter_to_segment1: - | dist(inter,0), dist(inter,0), dist(inter,0), ... | - | dist(inter,1), dist(inter,1), dist(inter,1), ... | - ... - dist_inter_to_semgnet2: - | dist(inter,0), dist(inter,1), dist(inter,2), ... | - | dist(inter,0), dist(inter,1), dist(inter,2), ... | - ... - ''' - - dist_inter_to_segment1_start = np.sqrt( - np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] - dist_inter_to_segment1_end = np.sqrt( - np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] - dist_inter_to_segment2_start = np.sqrt( - np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] - dist_inter_to_segment2_end = np.sqrt( - np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] - - # sort ascending - dist_inter_to_segment1 = np.sort( - np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1), - axis=-1) # [n_batch, n_batch, 2] - dist_inter_to_segment2 = np.sort( - np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1), - axis=-1) # [n_batch, n_batch, 2] - - # 3.2 get degree - inter_to_start = new_centers[:, None, :] - inter_pts - deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi - deg_inter_to_start[deg_inter_to_start < 0.0] += 360 - inter_to_end = new_centers[None, :, :] - inter_pts - deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi - deg_inter_to_end[deg_inter_to_end < 0.0] += 360 - - ''' - B -- G - | | - C -- R - B : blue / G: green / C: cyan / R: red - - 0 -- 1 - | | - 3 -- 2 - ''' - # rename variables - deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end - # sort deg ascending - deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1) - - deg_diff_map = np.abs(deg1_map - deg2_map) - # we only consider the smallest degree of intersect - deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180] - - # define available degree range - deg_range = [60, 120] - - corner_dict = {corner_info: [] for corner_info in range(4)} - inter_points = [] - for i in range(inter_pts.shape[0]): - for j in range(i + 1, inter_pts.shape[1]): - # i, j > line index, always i < j - x, y = inter_pts[i, j, :] - deg1, deg2 = deg_sort[i, j, :] - deg_diff = deg_diff_map[i, j] - - check_degree = 
deg_diff > deg_range[0] and deg_diff < deg_range[1] - - outside_ratio = params['outside_ratio'] # over ratio >>> drop it! - inside_ratio = params['inside_ratio'] # over ratio >>> drop it! - check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \ - dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \ - (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \ - dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \ - ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \ - dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \ - (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \ - dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio)) - - if check_degree and check_distance: - corner_info = None - - if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \ - (deg2 >= 315 and deg1 >= 45 and deg1 <= 120): - corner_info, color_info = 0, 'blue' - elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225): - corner_info, color_info = 1, 'green' - elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315): - corner_info, color_info = 2, 'black' - elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \ - (deg2 >= 315 and deg1 >= 225 and deg1 <= 315): - corner_info, color_info = 3, 'cyan' - else: - corner_info, color_info = 4, 'red' # we don't use it - continue - - corner_dict[corner_info].append([x, y, i, j]) - inter_points.append([x, y]) - - square_list = [] - connect_list = [] - segments_list = [] - for corner0 in corner_dict[0]: - for corner1 in corner_dict[1]: - connect01 = False - for corner0_line in corner0[2:]: - if corner0_line in corner1[2:]: - connect01 = True - break - if connect01: - for corner2 in corner_dict[2]: - connect12 = False - for corner1_line in corner1[2:]: - if corner1_line in corner2[2:]: - connect12 = True - break - if connect12: - for corner3 in corner_dict[3]: - connect23 = False - for corner2_line in corner2[2:]: - if corner2_line in corner3[2:]: - connect23 = True - break - if connect23: - for corner3_line in corner3[2:]: - if corner3_line in corner0[2:]: - # SQUARE!!! - ''' - 0 -- 1 - | | - 3 -- 2 - square_list: - order: 0 > 1 > 2 > 3 - | x0, y0, x1, y1, x2, y2, x3, y3 | - | x0, y0, x1, y1, x2, y2, x3, y3 | - ... - connect_list: - order: 01 > 12 > 23 > 30 - | line_idx01, line_idx12, line_idx23, line_idx30 | - | line_idx01, line_idx12, line_idx23, line_idx30 | - ... - segments_list: - order: 0 > 1 > 2 > 3 - | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | - | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | - ... 
- ''' - square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2]) - connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line]) - segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:]) - - def check_outside_inside(segments_info, connect_idx): - # return 'outside or inside', min distance, cover_param, peri_param - if connect_idx == segments_info[0]: - check_dist_mat = dist_inter_to_segment1 - else: - check_dist_mat = dist_inter_to_segment2 - - i, j = segments_info - min_dist, max_dist = check_dist_mat[i, j, :] - connect_dist = dist_segments[connect_idx] - if max_dist > connect_dist: - return 'outside', min_dist, 0, 1 - else: - return 'inside', min_dist, -1, -1 - - top_square = None - - try: - map_size = input_shape[0] / 2 - squares = np.array(square_list).reshape([-1, 4, 2]) - score_array = [] - connect_array = np.array(connect_list) - segments_array = np.array(segments_list).reshape([-1, 4, 2]) - - # get degree of corners: - squares_rollup = np.roll(squares, 1, axis=1) - squares_rolldown = np.roll(squares, -1, axis=1) - vec1 = squares_rollup - squares - normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10) - vec2 = squares_rolldown - squares - normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10) - inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4] - squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4] - - # get square score - overlap_scores = [] - degree_scores = [] - length_scores = [] - - for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree): - ''' - 0 -- 1 - | | - 3 -- 2 - - # segments: [4, 2] - # connects: [4] - ''' - - ###################################### OVERLAP SCORES - cover = 0 - perimeter = 0 - # check 0 > 1 > 2 > 3 - square_length = [] - - for start_idx in range(4): - end_idx = (start_idx + 1) % 4 - - connect_idx = connects[start_idx] # segment idx of segment01 - start_segments = segments[start_idx] - end_segments = segments[end_idx] - - start_point = square[start_idx] - end_point = square[end_idx] - - # check whether outside or inside - start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments, - connect_idx) - end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx) - - cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min - perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min - - square_length.append( - dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min) - - overlap_scores.append(cover / perimeter) - ###################################### - ###################################### DEGREE SCORES - ''' - deg0 vs deg2 - deg1 vs deg3 - ''' - deg0, deg1, deg2, deg3 = degree - deg_ratio1 = deg0 / deg2 - if deg_ratio1 > 1.0: - deg_ratio1 = 1 / deg_ratio1 - deg_ratio2 = deg1 / deg3 - if deg_ratio2 > 1.0: - deg_ratio2 = 1 / deg_ratio2 - degree_scores.append((deg_ratio1 + deg_ratio2) / 2) - ###################################### - ###################################### LENGTH SCORES - ''' - len0 vs len2 - len1 vs len3 - ''' - len0, len1, len2, len3 = square_length - len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0 - len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1 - length_scores.append((len_ratio1 + len_ratio2) / 2) - - 
###################################### - - overlap_scores = np.array(overlap_scores) - overlap_scores /= np.max(overlap_scores) - - degree_scores = np.array(degree_scores) - # degree_scores /= np.max(degree_scores) - - length_scores = np.array(length_scores) - - ###################################### AREA SCORES - area_scores = np.reshape(squares, [-1, 4, 2]) - area_x = area_scores[:, :, 0] - area_y = area_scores[:, :, 1] - correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0] - area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1) - area_scores = 0.5 * np.abs(area_scores + correction) - area_scores /= (map_size * map_size) # np.max(area_scores) - ###################################### - - ###################################### CENTER SCORES - centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2] - # squares: [n, 4, 2] - square_centers = np.mean(squares, axis=1) # [n, 2] - center2center = np.sqrt(np.sum((centers - square_centers) ** 2)) - center_scores = center2center / (map_size / np.sqrt(2.0)) - - ''' - score_w = [overlap, degree, area, center, length] - ''' - score_w = [0.0, 1.0, 10.0, 0.5, 1.0] - score_array = params['w_overlap'] * overlap_scores \ - + params['w_degree'] * degree_scores \ - + params['w_area'] * area_scores \ - - params['w_center'] * center_scores \ - + params['w_length'] * length_scores - - best_square = [] - - sorted_idx = np.argsort(score_array)[::-1] - score_array = score_array[sorted_idx] - squares = squares[sorted_idx] - - except Exception as e: - pass - - '''return list - merged_lines, squares, scores - ''' - - try: - new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1] - new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0] - new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1] - new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0] - except: - new_segments = [] - - try: - squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1] - squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0] - except: - squares = [] - score_array = [] - - try: - inter_points = np.array(inter_points) - inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1] - inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0] - except: - inter_points = [] - - return new_segments, squares, score_array, inter_points diff --git a/ControlNet/annotator/openpose/LICENSE b/ControlNet/annotator/openpose/LICENSE deleted file mode 100644 index 6f60b76d35fa1012809985780964a5068adce4fd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/LICENSE +++ /dev/null @@ -1,108 +0,0 @@ -OPENPOSE: MULTIPERSON KEYPOINT DETECTION -SOFTWARE LICENSE AGREEMENT -ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY - -BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. - -This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor. 
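For reference, the MLSD utilities deleted above expose two entry points: pred_lines returns an N x 4 array of (x1, y1, x2, y2) segments in original-image coordinates, and pred_squares returns the merged segments, candidate squares, their scores, and the raw intersection points. The sketch below shows one plausible way pred_lines was consumed, drawing the detected segments onto a blank canvas; it is not part of this diff, and the annotator.mlsd.utils import path and the pre-built CUDA model are assumptions.

import cv2
import numpy as np
import torch

from annotator.mlsd.utils import pred_lines  # assumed location of the utils file deleted above

def mlsd_line_map(image_bgr, model, score_thr=0.1, dist_thr=20.0):
    # image_bgr: HxWx3 uint8 array; model: an MLSD network already moved to CUDA (assumption)
    canvas = np.zeros_like(image_bgr)
    with torch.no_grad():
        lines = pred_lines(image_bgr, model, [512, 512], score_thr, dist_thr)
    for x1, y1, x2, y2 in lines:
        cv2.line(canvas, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 255), 1)
    return canvas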
- -RESERVATION OF OWNERSHIP AND GRANT OF LICENSE: -Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, -non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i). - -CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication. - -COPYRIGHT: The Software is owned by Licensor and is protected by United -States copyright laws and applicable international treaties and/or conventions. - -PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto. - -DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement. - -BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies. - -USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor. - -You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software. - -ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void. 
- -TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below. - -The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement. - -FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement. - -DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS. - -SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement. - -EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage. - -EXPORT REGULATION: Licensee agrees to comply with any and all applicable -U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control. - -SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby. - -NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor. - -GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania. - -ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto. - - - -************************************************************************ - -THIRD-PARTY SOFTWARE NOTICES AND INFORMATION - -This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. - -1. 
Caffe, version 1.0.0, (https://github.com/BVLC/caffe/) - -COPYRIGHT - -All contributions by the University of California: -Copyright (c) 2014-2017 The Regents of the University of California (Regents) -All rights reserved. - -All other contributions: -Copyright (c) 2014-2017, the respective contributors -All rights reserved. - -Caffe uses a shared copyright model: each contributor holds copyright over -their contributions to Caffe. The project versioning records all such -contribution and copyright details. If a contributor wants to further mark -their specific copyright on a particular contribution, they should indicate -their copyright solely in the commit message of the change when it is -committed. - -LICENSE - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -CONTRIBUTION AGREEMENT - -By contributing to the BVLC/caffe repository through pull-request, comment, -or otherwise, the contributor releases their content to the -license and copyright terms herein. - -************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** \ No newline at end of file diff --git a/ControlNet/annotator/openpose/__init__.py b/ControlNet/annotator/openpose/__init__.py deleted file mode 100644 index 92e530fd6913a92b1e624d3e334252bcfdba902f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# Openpose -# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose -# 2nd Edited by https://github.com/Hzzone/pytorch-openpose -# 3rd Edited by ControlNet - -import os -os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" - -import torch -import numpy as np -from . 
import util -from .body import Body -from .hand import Hand -from annotator.util import annotator_ckpts_path - - -body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth" -hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth" - - -class OpenposeDetector: - def __init__(self): - body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth") - hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth") - - if not os.path.exists(hand_modelpath): - from basicsr.utils.download_util import load_file_from_url - load_file_from_url(body_model_path, model_dir=annotator_ckpts_path) - load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path) - - self.body_estimation = Body(body_modelpath) - self.hand_estimation = Hand(hand_modelpath) - - def __call__(self, oriImg, hand=False): - oriImg = oriImg[:, :, ::-1].copy() - with torch.no_grad(): - candidate, subset = self.body_estimation(oriImg) - canvas = np.zeros_like(oriImg) - canvas = util.draw_bodypose(canvas, candidate, subset) - if hand: - hands_list = util.handDetect(candidate, subset, oriImg) - all_hand_peaks = [] - for x, y, w, is_left in hands_list: - peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) - all_hand_peaks.append(peaks) - canvas = util.draw_handpose(canvas, all_hand_peaks) - return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist()) diff --git a/ControlNet/annotator/openpose/body.py b/ControlNet/annotator/openpose/body.py deleted file mode 100644 index 7c3cf7a388b4ac81004524e64125e383bdd455bd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/body.py +++ /dev/null @@ -1,219 +0,0 @@ -import cv2 -import numpy as np -import math -import time -from scipy.ndimage.filters import gaussian_filter -import matplotlib.pyplot as plt -import matplotlib -import torch -from torchvision import transforms - -from . 
import util -from .model import bodypose_model - -class Body(object): - def __init__(self, model_path): - self.model = bodypose_model() - if torch.cuda.is_available(): - self.model = self.model.cuda() - print('cuda') - model_dict = util.transfer(self.model, torch.load(model_path)) - self.model.load_state_dict(model_dict) - self.model.eval() - - def __call__(self, oriImg): - # scale_search = [0.5, 1.0, 1.5, 2.0] - scale_search = [0.5] - boxsize = 368 - stride = 8 - padValue = 128 - thre1 = 0.1 - thre2 = 0.05 - multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] - heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) - paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) - - for m in range(len(multiplier)): - scale = multiplier[m] - imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) - imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue) - im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5 - im = np.ascontiguousarray(im) - - data = torch.from_numpy(im).float() - if torch.cuda.is_available(): - data = data.cuda() - # data = data.permute([2, 0, 1]).unsqueeze(0).float() - with torch.no_grad(): - Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) - Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() - Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() - - # extract outputs, resize, and remove padding - # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps - heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps - heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC) - heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] - heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC) - - # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs - paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs - paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC) - paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] - paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC) - - heatmap_avg += heatmap_avg + heatmap / len(multiplier) - paf_avg += + paf / len(multiplier) - - all_peaks = [] - peak_counter = 0 - - for part in range(18): - map_ori = heatmap_avg[:, :, part] - one_heatmap = gaussian_filter(map_ori, sigma=3) - - map_left = np.zeros(one_heatmap.shape) - map_left[1:, :] = one_heatmap[:-1, :] - map_right = np.zeros(one_heatmap.shape) - map_right[:-1, :] = one_heatmap[1:, :] - map_up = np.zeros(one_heatmap.shape) - map_up[:, 1:] = one_heatmap[:, :-1] - map_down = np.zeros(one_heatmap.shape) - map_down[:, :-1] = one_heatmap[:, 1:] - - peaks_binary = np.logical_and.reduce( - (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1)) - peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse - peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks] - peak_id = range(peak_counter, peak_counter + len(peaks)) - peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))] - - all_peaks.append(peaks_with_score_and_id) - peak_counter += len(peaks) - - # find 
connection in the specified sequence, center 29 is in the position 15 - limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \ - [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \ - [1, 16], [16, 18], [3, 17], [6, 18]] - # the middle joints heatmap correpondence - mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \ - [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \ - [55, 56], [37, 38], [45, 46]] - - connection_all = [] - special_k = [] - mid_num = 10 - - for k in range(len(mapIdx)): - score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] - candA = all_peaks[limbSeq[k][0] - 1] - candB = all_peaks[limbSeq[k][1] - 1] - nA = len(candA) - nB = len(candB) - indexA, indexB = limbSeq[k] - if (nA != 0 and nB != 0): - connection_candidate = [] - for i in range(nA): - for j in range(nB): - vec = np.subtract(candB[j][:2], candA[i][:2]) - norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) - norm = max(0.001, norm) - vec = np.divide(vec, norm) - - startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \ - np.linspace(candA[i][1], candB[j][1], num=mid_num))) - - vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \ - for I in range(len(startend))]) - vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \ - for I in range(len(startend))]) - - score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1]) - score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min( - 0.5 * oriImg.shape[0] / norm - 1, 0) - criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts) - criterion2 = score_with_dist_prior > 0 - if criterion1 and criterion2: - connection_candidate.append( - [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]]) - - connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True) - connection = np.zeros((0, 5)) - for c in range(len(connection_candidate)): - i, j, s = connection_candidate[c][0:3] - if (i not in connection[:, 3] and j not in connection[:, 4]): - connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]]) - if (len(connection) >= min(nA, nB)): - break - - connection_all.append(connection) - else: - special_k.append(k) - connection_all.append([]) - - # last number in each row is the total parts number of that person - # the second last number in each row is the score of the overall configuration - subset = -1 * np.ones((0, 20)) - candidate = np.array([item for sublist in all_peaks for item in sublist]) - - for k in range(len(mapIdx)): - if k not in special_k: - partAs = connection_all[k][:, 0] - partBs = connection_all[k][:, 1] - indexA, indexB = np.array(limbSeq[k]) - 1 - - for i in range(len(connection_all[k])): # = 1:size(temp,1) - found = 0 - subset_idx = [-1, -1] - for j in range(len(subset)): # 1:size(subset,1): - if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]: - subset_idx[found] = j - found += 1 - - if found == 1: - j = subset_idx[0] - if subset[j][indexB] != partBs[i]: - subset[j][indexB] = partBs[i] - subset[j][-1] += 1 - subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2] - elif found == 2: # if found 2 and disjoint, merge them - j1, j2 = subset_idx - membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2] - if len(np.nonzero(membership == 2)[0]) == 0: # merge - subset[j1][:-2] += (subset[j2][:-2] + 1) - 
subset[j1][-2:] += subset[j2][-2:] - subset[j1][-2] += connection_all[k][i][2] - subset = np.delete(subset, j2, 0) - else: # as like found == 1 - subset[j1][indexB] = partBs[i] - subset[j1][-1] += 1 - subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2] - - # if find no partA in the subset, create a new subset - elif not found and k < 17: - row = -1 * np.ones(20) - row[indexA] = partAs[i] - row[indexB] = partBs[i] - row[-1] = 2 - row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2] - subset = np.vstack([subset, row]) - # delete some rows of subset which has few parts occur - deleteIdx = [] - for i in range(len(subset)): - if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: - deleteIdx.append(i) - subset = np.delete(subset, deleteIdx, axis=0) - - # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts - # candidate: x, y, score, id - return candidate, subset - -if __name__ == "__main__": - body_estimation = Body('../model/body_pose_model.pth') - - test_image = '../images/ski.jpg' - oriImg = cv2.imread(test_image) # B,G,R order - candidate, subset = body_estimation(oriImg) - canvas = util.draw_bodypose(oriImg, candidate, subset) - plt.imshow(canvas[:, :, [2, 1, 0]]) - plt.show() diff --git a/ControlNet/annotator/openpose/hand.py b/ControlNet/annotator/openpose/hand.py deleted file mode 100644 index 3d0bf17165ad7eb225332b51f4a2aa16718664b2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/hand.py +++ /dev/null @@ -1,86 +0,0 @@ -import cv2 -import json -import numpy as np -import math -import time -from scipy.ndimage.filters import gaussian_filter -import matplotlib.pyplot as plt -import matplotlib -import torch -from skimage.measure import label - -from .model import handpose_model -from . 
import util - -class Hand(object): - def __init__(self, model_path): - self.model = handpose_model() - if torch.cuda.is_available(): - self.model = self.model.cuda() - print('cuda') - model_dict = util.transfer(self.model, torch.load(model_path)) - self.model.load_state_dict(model_dict) - self.model.eval() - - def __call__(self, oriImg): - scale_search = [0.5, 1.0, 1.5, 2.0] - # scale_search = [0.5] - boxsize = 368 - stride = 8 - padValue = 128 - thre = 0.05 - multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] - heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22)) - # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) - - for m in range(len(multiplier)): - scale = multiplier[m] - imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) - imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue) - im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5 - im = np.ascontiguousarray(im) - - data = torch.from_numpy(im).float() - if torch.cuda.is_available(): - data = data.cuda() - # data = data.permute([2, 0, 1]).unsqueeze(0).float() - with torch.no_grad(): - output = self.model(data).cpu().numpy() - # output = self.model(data).numpy() - - # extract outputs, resize, and remove padding - heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps - heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC) - heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] - heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC) - - heatmap_avg += heatmap / len(multiplier) - - all_peaks = [] - for part in range(21): - map_ori = heatmap_avg[:, :, part] - one_heatmap = gaussian_filter(map_ori, sigma=3) - binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8) - # all values below the threshold - if np.sum(binary) == 0: - all_peaks.append([0, 0]) - continue - label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim) - max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1 - label_img[label_img != max_index] = 0 - map_ori[label_img == 0] = 0 - - y, x = util.npmax(map_ori) - all_peaks.append([x, y]) - return np.array(all_peaks) - -if __name__ == "__main__": - hand_estimation = Hand('../model/hand_pose_model.pth') - - # test_image = '../images/hand.jpg' - test_image = '../images/hand.jpg' - oriImg = cv2.imread(test_image) # B,G,R order - peaks = hand_estimation(oriImg) - canvas = util.draw_handpose(oriImg, peaks, True) - cv2.imshow('', canvas) - cv2.waitKey(0) \ No newline at end of file diff --git a/ControlNet/annotator/openpose/model.py b/ControlNet/annotator/openpose/model.py deleted file mode 100644 index 5dfc80de827a17beccb9b0f3f7588545be78c9de..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/model.py +++ /dev/null @@ -1,219 +0,0 @@ -import torch -from collections import OrderedDict - -import torch -import torch.nn as nn - -def make_layers(block, no_relu_layers): - layers = [] - for layer_name, v in block.items(): - if 'pool' in layer_name: - layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], - padding=v[2]) - layers.append((layer_name, layer)) - else: - conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], - kernel_size=v[2], stride=v[3], - padding=v[4]) - layers.append((layer_name, conv2d)) - if layer_name not in no_relu_layers: - 
layers.append(('relu_'+layer_name, nn.ReLU(inplace=True))) - - return nn.Sequential(OrderedDict(layers)) - -class bodypose_model(nn.Module): - def __init__(self): - super(bodypose_model, self).__init__() - - # these layers have no relu layer - no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\ - 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\ - 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\ - 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1'] - blocks = {} - block0 = OrderedDict([ - ('conv1_1', [3, 64, 3, 1, 1]), - ('conv1_2', [64, 64, 3, 1, 1]), - ('pool1_stage1', [2, 2, 0]), - ('conv2_1', [64, 128, 3, 1, 1]), - ('conv2_2', [128, 128, 3, 1, 1]), - ('pool2_stage1', [2, 2, 0]), - ('conv3_1', [128, 256, 3, 1, 1]), - ('conv3_2', [256, 256, 3, 1, 1]), - ('conv3_3', [256, 256, 3, 1, 1]), - ('conv3_4', [256, 256, 3, 1, 1]), - ('pool3_stage1', [2, 2, 0]), - ('conv4_1', [256, 512, 3, 1, 1]), - ('conv4_2', [512, 512, 3, 1, 1]), - ('conv4_3_CPM', [512, 256, 3, 1, 1]), - ('conv4_4_CPM', [256, 128, 3, 1, 1]) - ]) - - - # Stage 1 - block1_1 = OrderedDict([ - ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), - ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), - ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), - ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), - ('conv5_5_CPM_L1', [512, 38, 1, 1, 0]) - ]) - - block1_2 = OrderedDict([ - ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), - ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), - ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), - ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), - ('conv5_5_CPM_L2', [512, 19, 1, 1, 0]) - ]) - blocks['block1_1'] = block1_1 - blocks['block1_2'] = block1_2 - - self.model0 = make_layers(block0, no_relu_layers) - - # Stages 2 - 6 - for i in range(2, 7): - blocks['block%d_1' % i] = OrderedDict([ - ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), - ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), - ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), - ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), - ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), - ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), - ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) - ]) - - blocks['block%d_2' % i] = OrderedDict([ - ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), - ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), - ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), - ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), - ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), - ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), - ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) - ]) - - for k in blocks.keys(): - blocks[k] = make_layers(blocks[k], no_relu_layers) - - self.model1_1 = blocks['block1_1'] - self.model2_1 = blocks['block2_1'] - self.model3_1 = blocks['block3_1'] - self.model4_1 = blocks['block4_1'] - self.model5_1 = blocks['block5_1'] - self.model6_1 = blocks['block6_1'] - - self.model1_2 = blocks['block1_2'] - self.model2_2 = blocks['block2_2'] - self.model3_2 = blocks['block3_2'] - self.model4_2 = blocks['block4_2'] - self.model5_2 = blocks['block5_2'] - self.model6_2 = blocks['block6_2'] - - - def forward(self, x): - - out1 = self.model0(x) - - out1_1 = self.model1_1(out1) - out1_2 = self.model1_2(out1) - out2 = torch.cat([out1_1, out1_2, out1], 1) - - out2_1 = self.model2_1(out2) - out2_2 = self.model2_2(out2) - out3 = torch.cat([out2_1, out2_2, out1], 1) - - out3_1 = self.model3_1(out3) - out3_2 = self.model3_2(out3) - out4 = torch.cat([out3_1, out3_2, out1], 1) - - out4_1 = self.model4_1(out4) - out4_2 = self.model4_2(out4) - out5 = torch.cat([out4_1, out4_2, 
out1], 1) - - out5_1 = self.model5_1(out5) - out5_2 = self.model5_2(out5) - out6 = torch.cat([out5_1, out5_2, out1], 1) - - out6_1 = self.model6_1(out6) - out6_2 = self.model6_2(out6) - - return out6_1, out6_2 - -class handpose_model(nn.Module): - def __init__(self): - super(handpose_model, self).__init__() - - # these layers have no relu layer - no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\ - 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6'] - # stage 1 - block1_0 = OrderedDict([ - ('conv1_1', [3, 64, 3, 1, 1]), - ('conv1_2', [64, 64, 3, 1, 1]), - ('pool1_stage1', [2, 2, 0]), - ('conv2_1', [64, 128, 3, 1, 1]), - ('conv2_2', [128, 128, 3, 1, 1]), - ('pool2_stage1', [2, 2, 0]), - ('conv3_1', [128, 256, 3, 1, 1]), - ('conv3_2', [256, 256, 3, 1, 1]), - ('conv3_3', [256, 256, 3, 1, 1]), - ('conv3_4', [256, 256, 3, 1, 1]), - ('pool3_stage1', [2, 2, 0]), - ('conv4_1', [256, 512, 3, 1, 1]), - ('conv4_2', [512, 512, 3, 1, 1]), - ('conv4_3', [512, 512, 3, 1, 1]), - ('conv4_4', [512, 512, 3, 1, 1]), - ('conv5_1', [512, 512, 3, 1, 1]), - ('conv5_2', [512, 512, 3, 1, 1]), - ('conv5_3_CPM', [512, 128, 3, 1, 1]) - ]) - - block1_1 = OrderedDict([ - ('conv6_1_CPM', [128, 512, 1, 1, 0]), - ('conv6_2_CPM', [512, 22, 1, 1, 0]) - ]) - - blocks = {} - blocks['block1_0'] = block1_0 - blocks['block1_1'] = block1_1 - - # stage 2-6 - for i in range(2, 7): - blocks['block%d' % i] = OrderedDict([ - ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]), - ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]), - ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]), - ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]), - ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]), - ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]), - ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0]) - ]) - - for k in blocks.keys(): - blocks[k] = make_layers(blocks[k], no_relu_layers) - - self.model1_0 = blocks['block1_0'] - self.model1_1 = blocks['block1_1'] - self.model2 = blocks['block2'] - self.model3 = blocks['block3'] - self.model4 = blocks['block4'] - self.model5 = blocks['block5'] - self.model6 = blocks['block6'] - - def forward(self, x): - out1_0 = self.model1_0(x) - out1_1 = self.model1_1(out1_0) - concat_stage2 = torch.cat([out1_1, out1_0], 1) - out_stage2 = self.model2(concat_stage2) - concat_stage3 = torch.cat([out_stage2, out1_0], 1) - out_stage3 = self.model3(concat_stage3) - concat_stage4 = torch.cat([out_stage3, out1_0], 1) - out_stage4 = self.model4(concat_stage4) - concat_stage5 = torch.cat([out_stage4, out1_0], 1) - out_stage5 = self.model5(concat_stage5) - concat_stage6 = torch.cat([out_stage5, out1_0], 1) - out_stage6 = self.model6(concat_stage6) - return out_stage6 - - diff --git a/ControlNet/annotator/openpose/util.py b/ControlNet/annotator/openpose/util.py deleted file mode 100644 index 6f91ae0e65abaf0cbd62d803f56498991141e61b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/openpose/util.py +++ /dev/null @@ -1,164 +0,0 @@ -import math -import numpy as np -import matplotlib -import cv2 - - -def padRightDownCorner(img, stride, padValue): - h = img.shape[0] - w = img.shape[1] - - pad = 4 * [None] - pad[0] = 0 # up - pad[1] = 0 # left - pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down - pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right - - img_padded = img - pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1)) - img_padded = np.concatenate((pad_up, img_padded), axis=0) - pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1)) - img_padded = np.concatenate((pad_left, 
img_padded), axis=1) - pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1)) - img_padded = np.concatenate((img_padded, pad_down), axis=0) - pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1)) - img_padded = np.concatenate((img_padded, pad_right), axis=1) - - return img_padded, pad - -# transfer the caffe model weights to pytorch so that the layer names match -def transfer(model, model_weights): - transfered_model_weights = {} - for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])] - return transfered_model_weights - -# draw the body keypoints and limbs -def draw_bodypose(canvas, candidate, subset): - stickwidth = 4 - limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \ - [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \ - [1, 16], [16, 18], [3, 17], [6, 18]] - - colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ - [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ - [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] - for i in range(18): - for n in range(len(subset)): - index = int(subset[n][i]) - if index == -1: - continue - x, y = candidate[index][0:2] - cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1) - for i in range(17): - for n in range(len(subset)): - index = subset[n][np.array(limbSeq[i]) - 1] - if -1 in index: - continue - cur_canvas = canvas.copy() - Y = candidate[index.astype(int), 0] - X = candidate[index.astype(int), 1] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) - cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]]) - # plt.imshow(canvas[:, :, [2, 1, 0]]) - return canvas - - -# images drawn by opencv are not good.
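To make the padding contract above concrete: padRightDownCorner pads only the bottom and right edges so that both spatial dimensions become multiples of stride, and returns the pad amounts as [up, left, down, right]; body.py and hand.py later crop pad[2] and pad[3] back off after upsampling the network output by stride. A small worked example, with the import path and sizes chosen purely for illustration:

import numpy as np
from annotator.openpose.util import padRightDownCorner  # assumed path of the util file deleted here

img = np.zeros((37, 50, 3), dtype=np.uint8)
padded, pad = padRightDownCorner(img, stride=8, padValue=128)
# 37 -> 40 rows, 50 -> 56 columns:
# padded.shape == (40, 56, 3), pad == [0, 0, 3, 6]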
-def draw_handpose(canvas, all_hand_peaks, show_number=False): - edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \ - [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]] - - for peaks in all_hand_peaks: - for ie, e in enumerate(edges): - if np.sum(np.all(peaks[e], axis=1)==0)==0: - x1, y1 = peaks[e[0]] - x2, y2 = peaks[e[1]] - cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie/float(len(edges)), 1.0, 1.0])*255, thickness=2) - - for i, keyponit in enumerate(peaks): - x, y = keyponit - cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1) - if show_number: - cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA) - return canvas - -# detect hand according to body pose keypoints -# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp -def handDetect(candidate, subset, oriImg): - # right hand: wrist 4, elbow 3, shoulder 2 - # left hand: wrist 7, elbow 6, shoulder 5 - ratioWristElbow = 0.33 - detect_result = [] - image_height, image_width = oriImg.shape[0:2] - for person in subset.astype(int): - # if any of three not detected - has_left = np.sum(person[[5, 6, 7]] == -1) == 0 - has_right = np.sum(person[[2, 3, 4]] == -1) == 0 - if not (has_left or has_right): - continue - hands = [] - #left hand - if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] - x1, y1 = candidate[left_shoulder_index][:2] - x2, y2 = candidate[left_elbow_index][:2] - x3, y3 = candidate[left_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, True]) - # right hand - if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] - x1, y1 = candidate[right_shoulder_index][:2] - x2, y2 = candidate[right_elbow_index][:2] - x3, y3 = candidate[right_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, False]) - - for x1, y1, x2, y2, x3, y3, is_left in hands: - # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox - # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]); - # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]); - # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow); - # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder); - # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); - x = x3 + ratioWristElbow * (x3 - x2) - y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) - distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) - width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) - # x-y refers to the center --> offset to topLeft point - # handRectangle.x -= handRectangle.width / 2.f; - # handRectangle.y -= handRectangle.height / 2.f; - x -= width / 2 - y -= width / 2 # width = height - # overflow the image - if x < 0: x = 0 - if y < 0: y = 0 - width1 = width - width2 = width - if x + width > image_width: width1 = image_width - x - if y + width > image_height: width2 = image_height - y - width = min(width1, width2) - # the max hand box value is 20 pixels - if width >= 20: - detect_result.append([int(x), int(y), int(width), is_left]) - - ''' - return value: [[x, y, w, True if left hand else False]]. 
- width=height since the network require squared input. - x, y is the coordinate of top left - ''' - return detect_result - -# get max index of 2d array -def npmax(array): - arrayindex = array.argmax(1) - arrayvalue = array.max(1) - i = arrayvalue.argmax() - j = arrayindex[i] - return i, j diff --git a/ControlNet/annotator/uniformer/LICENSE b/ControlNet/annotator/uniformer/LICENSE deleted file mode 100644 index c38dc639e6e238fbf59608f80b3a6ff1928ac429..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Copyright 2022 SenseTime X-Lab. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022 SenseTime X-Lab. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/__init__.py b/ControlNet/annotator/uniformer/__init__.py deleted file mode 100644 index 3364d40997447a4ec15ca7a525a4d0e92ab211bd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Uniformer -# From https://github.com/Sense-X/UniFormer -# # Apache-2.0 license - -import os - -from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot -from annotator.uniformer.mmseg.core.evaluation import get_palette -from annotator.util import annotator_ckpts_path - - -checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" - - -class UniformerDetector: - def __init__(self): - modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth") - if not os.path.exists(modelpath): - from basicsr.utils.download_util import load_file_from_url - load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path) - config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py") - self.model = init_segmentor(config_file, modelpath).cuda() - - def __call__(self, img): - result = inference_segmentor(self.model, img) - res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1) - return res_img diff --git a/ControlNet/annotator/uniformer/configs/_base_/default_runtime.py b/ControlNet/annotator/uniformer/configs/_base_/default_runtime.py deleted file mode 100644 index b564cc4e7e7d9a67dacaaddecb100e4d8f5c005b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/default_runtime.py +++ /dev/null @@ -1,14 +0,0 @@ -# yapf:disable -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook', by_epoch=False), - # dict(type='TensorboardLoggerHook') - ]) -# yapf:enable -dist_params = dict(backend='nccl') -log_level = 'INFO' -load_from = None -resume_from = None -workflow = [('train', 1)] -cudnn_benchmark = True diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py deleted file mode 100644 index a2cb653827e44e6015b3b83bc578003e614a6aa1..0000000000000000000000000000000000000000 --- 
a/ControlNet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py +++ /dev/null @@ -1,46 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='ANNHead', - in_channels=[1024, 2048], - in_index=[2, 3], - channels=512, - project_channels=256, - query_scales=(1, ), - key_pool_scales=(1, 3, 6, 8), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py deleted file mode 100644 index c8f5316cbcf3896ba9de7ca2c801eba512f01d5e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='APCHead', - in_channels=2048, - in_index=3, - channels=512, - pool_scales=(1, 2, 3, 6), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=dict(type='SyncBN', requires_grad=True), - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py deleted file mode 100644 index 794148f576b9e215c3c6963e73dffe98204b7717..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='CCHead', - in_channels=2048, - in_index=3, - channels=512, - recurrence=2, - dropout_ratio=0.1, - 
num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/cgnet.py b/ControlNet/annotator/uniformer/configs/_base_/models/cgnet.py deleted file mode 100644 index eff8d9458c877c5db894957e0b1b4597e40da6ab..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/cgnet.py +++ /dev/null @@ -1,35 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) -model = dict( - type='EncoderDecoder', - backbone=dict( - type='CGNet', - norm_cfg=norm_cfg, - in_channels=3, - num_channels=(32, 64, 128), - num_blocks=(3, 21), - dilations=(2, 4), - reductions=(8, 16)), - decode_head=dict( - type='FCNHead', - in_channels=256, - in_index=2, - channels=256, - num_convs=0, - concat_input=False, - dropout_ratio=0, - num_classes=19, - norm_cfg=norm_cfg, - loss_decode=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0, - class_weight=[ - 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, - 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, - 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, - 10.396974, 10.055647 - ])), - # model training and testing settings - train_cfg=dict(sampler=None), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py deleted file mode 100644 index 2c934939fac48525f22ad86f489a041dd7db7d09..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='DAHead', - in_channels=2048, - in_index=3, - channels=512, - pam_channels=64, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py deleted file mode 100644 index d7a43bee01422ad4795dd27874e0cd4bb6cbfecf..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py +++ 
/dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='ASPPHead', - in_channels=2048, - in_index=3, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py b/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py deleted file mode 100644 index 0cd262999d8b2cb8e14a5c32190ae73f479d8e81..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py +++ /dev/null @@ -1,50 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained=None, - backbone=dict( - type='UNet', - in_channels=3, - base_channels=64, - num_stages=5, - strides=(1, 1, 1, 1, 1), - enc_num_convs=(2, 2, 2, 2, 2), - dec_num_convs=(2, 2, 2, 2), - downsamples=(True, True, True, True), - enc_dilations=(1, 1, 1, 1, 1), - dec_dilations=(1, 1, 1, 1), - with_cp=False, - conv_cfg=None, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - upsample_cfg=dict(type='InterpConv'), - norm_eval=False), - decode_head=dict( - type='ASPPHead', - in_channels=64, - in_index=4, - channels=16, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=128, - in_index=3, - channels=64, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='slide', crop_size=256, stride=170)) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py deleted file mode 100644 index 050e39e091d816df9028d23aa3ecf9db74e441e1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py +++ /dev/null @@ -1,46 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - 
type='DepthwiseSeparableASPPHead', - in_channels=2048, - in_index=3, - channels=512, - dilations=(1, 12, 24, 36), - c1_in_channels=256, - c1_channels=48, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py deleted file mode 100644 index d22ba52640bebd805b3b8d07025e276dfb023759..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='DMHead', - in_channels=2048, - in_index=3, - channels=512, - filter_sizes=(1, 3, 5, 7), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=dict(type='SyncBN', requires_grad=True), - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py deleted file mode 100644 index edb4c174c51e34c103737ba39bfc48bf831e561d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py +++ /dev/null @@ -1,46 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='DNLHead', - in_channels=2048, - in_index=3, - channels=512, - dropout_ratio=0.1, - reduction=2, - use_scale=True, - mode='embedded_gaussian', - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and 
testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py deleted file mode 100644 index 26adcd430926de0862204a71d345f2543167f27b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py +++ /dev/null @@ -1,47 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='EMAHead', - in_channels=2048, - in_index=3, - channels=256, - ema_channels=512, - num_bases=64, - num_stages=3, - momentum=0.1, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py deleted file mode 100644 index be777123a886503172a95fe0719e956a147bbd68..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py +++ /dev/null @@ -1,48 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='EncHead', - in_channels=[512, 1024, 2048], - in_index=(1, 2, 3), - channels=512, - num_codes=32, - use_se_loss=True, - add_lateral=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - loss_se_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fast_scnn.py b/ControlNet/annotator/uniformer/configs/_base_/models/fast_scnn.py deleted file mode 100644 index 32fdeb659355a5ce5ef2cc7c2f30742703811cdf..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fast_scnn.py +++ /dev/null @@ -1,57 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) 
-model = dict( - type='EncoderDecoder', - backbone=dict( - type='FastSCNN', - downsample_dw_channels=(32, 48), - global_in_channels=64, - global_block_channels=(64, 96, 128), - global_block_strides=(2, 2, 1), - global_out_channels=128, - higher_in_channels=64, - lower_in_channels=128, - fusion_out_channels=128, - out_indices=(0, 1, 2), - norm_cfg=norm_cfg, - align_corners=False), - decode_head=dict( - type='DepthwiseSeparableFCNHead', - in_channels=128, - channels=128, - concat_input=False, - num_classes=19, - in_index=-1, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), - auxiliary_head=[ - dict( - type='FCNHead', - in_channels=128, - channels=32, - num_convs=1, - num_classes=19, - in_index=-2, - norm_cfg=norm_cfg, - concat_input=False, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), - dict( - type='FCNHead', - in_channels=64, - channels=32, - num_convs=1, - num_classes=19, - in_index=-3, - norm_cfg=norm_cfg, - concat_input=False, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), - ], - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_hr18.py b/ControlNet/annotator/uniformer/configs/_base_/models/fcn_hr18.py deleted file mode 100644 index c3e299bc89ada56ca14bbffcbdb08a586b8ed9e9..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_hr18.py +++ /dev/null @@ -1,52 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://msra/hrnetv2_w18', - backbone=dict( - type='HRNet', - norm_cfg=norm_cfg, - norm_eval=False, - extra=dict( - stage1=dict( - num_modules=1, - num_branches=1, - block='BOTTLENECK', - num_blocks=(4, ), - num_channels=(64, )), - stage2=dict( - num_modules=1, - num_branches=2, - block='BASIC', - num_blocks=(4, 4), - num_channels=(18, 36)), - stage3=dict( - num_modules=4, - num_branches=3, - block='BASIC', - num_blocks=(4, 4, 4), - num_channels=(18, 36, 72)), - stage4=dict( - num_modules=3, - num_branches=4, - block='BASIC', - num_blocks=(4, 4, 4, 4), - num_channels=(18, 36, 72, 144)))), - decode_head=dict( - type='FCNHead', - in_channels=[18, 36, 72, 144], - in_index=(0, 1, 2, 3), - channels=sum([18, 36, 72, 144]), - input_transform='resize_concat', - kernel_size=1, - num_convs=1, - concat_input=False, - dropout_ratio=-1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py deleted file mode 100644 index 5e98f6cc918b6146fc6d613c6918e825ef1355c3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py +++ /dev/null @@ -1,45 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - 
style='pytorch', - contract_dilation=True), - decode_head=dict( - type='FCNHead', - in_channels=2048, - in_index=3, - channels=512, - num_convs=2, - concat_input=True, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py b/ControlNet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py deleted file mode 100644 index a33e7972877f902d0e7d18401ca675e3e4e60a18..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py +++ /dev/null @@ -1,51 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained=None, - backbone=dict( - type='UNet', - in_channels=3, - base_channels=64, - num_stages=5, - strides=(1, 1, 1, 1, 1), - enc_num_convs=(2, 2, 2, 2, 2), - dec_num_convs=(2, 2, 2, 2), - downsamples=(True, True, True, True), - enc_dilations=(1, 1, 1, 1, 1), - dec_dilations=(1, 1, 1, 1), - with_cp=False, - conv_cfg=None, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - upsample_cfg=dict(type='InterpConv'), - norm_eval=False), - decode_head=dict( - type='FCNHead', - in_channels=64, - in_index=4, - channels=64, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=128, - in_index=3, - channels=64, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='slide', crop_size=256, stride=170)) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fpn_r50.py b/ControlNet/annotator/uniformer/configs/_base_/models/fpn_r50.py deleted file mode 100644 index 86ab327db92e44c14822d65f1c9277cb007f17c1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fpn_r50.py +++ /dev/null @@ -1,36 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 1, 1), - strides=(1, 2, 2, 2), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - num_outs=4), - decode_head=dict( - type='FPNHead', - in_channels=[256, 256, 256, 256], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - 
train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py b/ControlNet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py deleted file mode 100644 index 8aae98c5991055bfcc08e82ccdc09f8b1d9f8a8d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py +++ /dev/null @@ -1,35 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - mlp_ratio=4., - qkv_bias=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1), - neck=dict( - type='FPN', - in_channels=[64, 128, 320, 512], - out_channels=256, - num_outs=4), - decode_head=dict( - type='FPNHead', - in_channels=[256, 256, 256, 256], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole') -) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py deleted file mode 100644 index 3d2ad69f5c22adfe79d5fdabf920217628987166..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py +++ /dev/null @@ -1,46 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='GCHead', - in_channels=2048, - in_index=3, - channels=512, - ratio=1 / 4., - pooling_type='att', - fusion_types=('channel_add', ), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py deleted file mode 100644 index 93258242a90695cc94a7c6bd41562d6a75988771..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py +++ /dev/null @@ -1,25 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) -model = dict( - type='EncoderDecoder', - backbone=dict( - type='MobileNetV3', - arch='large', - out_indices=(1, 3, 16), - norm_cfg=norm_cfg), - decode_head=dict( - type='LRASPPHead', - in_channels=(16, 24, 960), - in_index=(0, 1, 2), - channels=128, - input_transform='multiple_select', - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - 
align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py deleted file mode 100644 index 5674a39854cafd1f2e363bac99c58ccae62f24da..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py +++ /dev/null @@ -1,46 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='NLHead', - in_channels=2048, - in_index=3, - channels=512, - dropout_ratio=0.1, - reduction=2, - use_scale=True, - mode='embedded_gaussian', - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py b/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py deleted file mode 100644 index c60f62a7cdf3f5c5096a7a7e725e8268fddcb057..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py +++ /dev/null @@ -1,68 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='CascadeEncoderDecoder', - num_stages=2, - pretrained='open-mmlab://msra/hrnetv2_w18', - backbone=dict( - type='HRNet', - norm_cfg=norm_cfg, - norm_eval=False, - extra=dict( - stage1=dict( - num_modules=1, - num_branches=1, - block='BOTTLENECK', - num_blocks=(4, ), - num_channels=(64, )), - stage2=dict( - num_modules=1, - num_branches=2, - block='BASIC', - num_blocks=(4, 4), - num_channels=(18, 36)), - stage3=dict( - num_modules=4, - num_branches=3, - block='BASIC', - num_blocks=(4, 4, 4), - num_channels=(18, 36, 72)), - stage4=dict( - num_modules=3, - num_branches=4, - block='BASIC', - num_blocks=(4, 4, 4, 4), - num_channels=(18, 36, 72, 144)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[18, 36, 72, 144], - channels=sum([18, 36, 72, 144]), - in_index=(0, 1, 2, 3), - input_transform='resize_concat', - kernel_size=1, - num_convs=1, - concat_input=False, - dropout_ratio=-1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[18, 36, 72, 144], - in_index=(0, 1, 2, 3), - input_transform='resize_concat', - channels=512, - ocr_channels=256, - dropout_ratio=-1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - ], - # model training and testing settings - 
train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py deleted file mode 100644 index 615aa3ff703942b6c22b2d6e9642504dd3e41ebd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py +++ /dev/null @@ -1,47 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='CascadeEncoderDecoder', - num_stages=2, - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=[ - dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=2048, - in_index=3, - channels=512, - ocr_channels=256, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ], - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/pointrend_r50.py b/ControlNet/annotator/uniformer/configs/_base_/models/pointrend_r50.py deleted file mode 100644 index 9d323dbf9466d41e0800aa57ef84045f3d874bdf..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/pointrend_r50.py +++ /dev/null @@ -1,56 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='CascadeEncoderDecoder', - num_stages=2, - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 1, 1), - strides=(1, 2, 2, 2), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - num_outs=4), - decode_head=[ - dict( - type='FPNHead', - in_channels=[256, 256, 256, 256], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=-1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - dict( - type='PointHead', - in_channels=[256], - in_index=[0], - channels=256, - num_fcs=3, - coarse_pred_each_layer=True, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ], - # model training and testing settings - train_cfg=dict( - num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), - test_cfg=dict( - mode='whole', - subdivision_steps=2, - subdivision_num_points=8196, - scale_factor=2)) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py deleted file mode 100644 index 689513fa9d2a40f14bf0ae4ae61f38f0dcc1b3da..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py +++ /dev/null 
@@ -1,49 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='PSAHead', - in_channels=2048, - in_index=3, - channels=512, - mask_size=(97, 97), - psa_type='bi-direction', - compact=False, - shrink_factor=2, - normalization_factor=1.0, - psa_softmax=True, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py b/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py deleted file mode 100644 index f451e08ad2eb0732dcb806b1851eb978d4acf136..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='PSPHead', - in_channels=2048, - in_index=3, - channels=512, - pool_scales=(1, 2, 3, 6), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py b/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py deleted file mode 100644 index fcff9ec4f41fad158344ecd77313dc14564f3682..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py +++ /dev/null @@ -1,50 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained=None, - backbone=dict( - type='UNet', - in_channels=3, - base_channels=64, - num_stages=5, - strides=(1, 1, 1, 1, 1), - enc_num_convs=(2, 2, 2, 2, 2), - dec_num_convs=(2, 2, 2, 2), - downsamples=(True, True, True, True), - enc_dilations=(1, 1, 1, 1, 1), - dec_dilations=(1, 1, 1, 1), - with_cp=False, - conv_cfg=None, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - upsample_cfg=dict(type='InterpConv'), - norm_eval=False), - 
decode_head=dict( - type='PSPHead', - in_channels=64, - in_index=4, - channels=16, - pool_scales=(1, 2, 3, 6), - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=128, - in_index=3, - channels=64, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=2, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='slide', crop_size=256, stride=170)) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/upernet_r50.py b/ControlNet/annotator/uniformer/configs/_base_/models/upernet_r50.py deleted file mode 100644 index 10974962fdd7136031fd06de1700f497d355ceaa..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/upernet_r50.py +++ /dev/null @@ -1,44 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='open-mmlab://resnet50_v1c', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 1, 1), - strides=(1, 2, 2, 2), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='UPerHead', - in_channels=[256, 512, 1024, 2048], - in_index=[0, 1, 2, 3], - pool_scales=(1, 2, 3, 6), - channels=512, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=1024, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/ControlNet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py b/ControlNet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py deleted file mode 100644 index 41aa4db809dc6e2c508e98051f61807d07477903..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py +++ /dev/null @@ -1,43 +0,0 @@ -# model settings -norm_cfg = dict(type='BN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained=None, - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - mlp_ratio=4., - qkv_bias=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1), - decode_head=dict( - type='UPerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - pool_scales=(1, 2, 3, 6), - channels=512, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=320, - in_index=2, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) \ 
No newline at end of file diff --git a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py b/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py deleted file mode 100644 index 52603890b10f25faf8eec9f9e5a4468fae09b811..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py +++ /dev/null @@ -1,9 +0,0 @@ -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optimizer_config = dict() -# learning policy -lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) -# runtime settings -runner = dict(type='IterBasedRunner', max_iters=160000) -checkpoint_config = dict(by_epoch=False, interval=16000) -evaluation = dict(interval=16000, metric='mIoU') diff --git a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py b/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py deleted file mode 100644 index bf780a1b6f6521833c6a5859675147824efa599d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py +++ /dev/null @@ -1,9 +0,0 @@ -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optimizer_config = dict() -# learning policy -lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) -# runtime settings -runner = dict(type='IterBasedRunner', max_iters=20000) -checkpoint_config = dict(by_epoch=False, interval=2000) -evaluation = dict(interval=2000, metric='mIoU') diff --git a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py b/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py deleted file mode 100644 index cdbf841abcb26eed87bf76ab816aff4bae0630ee..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py +++ /dev/null @@ -1,9 +0,0 @@ -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optimizer_config = dict() -# learning policy -lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) -# runtime settings -runner = dict(type='IterBasedRunner', max_iters=40000) -checkpoint_config = dict(by_epoch=False, interval=4000) -evaluation = dict(interval=4000, metric='mIoU') diff --git a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py b/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py deleted file mode 100644 index c190cee6bdc7922b688ea75dc8f152fa15c24617..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py +++ /dev/null @@ -1,9 +0,0 @@ -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optimizer_config = dict() -# learning policy -lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) -# runtime settings -runner = dict(type='IterBasedRunner', max_iters=80000) -checkpoint_config = dict(by_epoch=False, interval=8000) -evaluation = dict(interval=8000, metric='mIoU') diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/config.py b/ControlNet/annotator/uniformer/exp/upernet_global_small/config.py deleted file mode 100644 index 01db96bf9b0be531aa0eaf62fee51543712f8670..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/config.py +++ /dev/null @@ -1,38 +0,0 @@ -_base_ = [ - '../../configs/_base_/models/upernet_uniformer.py', - 
'../../configs/_base_/datasets/ade20k.py', - '../../configs/_base_/default_runtime.py', - '../../configs/_base_/schedules/schedule_160k.py' -] -model = dict( - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - drop_path_rate=0.25, - windows=False, - hybrid=False - ), - decode_head=dict( - in_channels=[64, 128, 320, 512], - num_classes=150 - ), - auxiliary_head=dict( - in_channels=320, - num_classes=150 - )) - -# AdamW optimizer, no weight decay for position embedding & layer norm in backbone -optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.)})) - -lr_config = dict(_delete_=True, policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, min_lr=0.0, by_epoch=False) - -data=dict(samples_per_gpu=2) \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/run.sh b/ControlNet/annotator/uniformer/exp/upernet_global_small/run.sh deleted file mode 100644 index 9fb22edfa7a32624ea08a63fe7d720c40db3b696..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -work_path=$(dirname $0) -PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ -python -m torch.distributed.launch --nproc_per_node=8 \ - tools/train.py ${work_path}/config.py \ - --launcher pytorch \ - --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \ - --work-dir ${work_path}/ckpt \ - 2>&1 | tee -a ${work_path}/log.txt diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/test.sh b/ControlNet/annotator/uniformer/exp/upernet_global_small/test.sh deleted file mode 100644 index d9a85e7a0d3b7c96b060f473d41254b37a382fcb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/test.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -work_path=$(dirname $0) -PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ -python -m torch.distributed.launch --nproc_per_node=8 \ - tools/test.py ${work_path}/test_config_h32.py \ - ${work_path}/ckpt/latest.pth \ - --launcher pytorch \ - --eval mIoU \ - 2>&1 | tee -a ${work_path}/log.txt diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_g.py b/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_g.py deleted file mode 100644 index e43737a98a3b174a9f2fe059c06d511144686459..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_g.py +++ /dev/null @@ -1,38 +0,0 @@ -_base_ = [ - '../../configs/_base_/models/upernet_uniformer.py', - '../../configs/_base_/datasets/ade20k.py', - '../../configs/_base_/default_runtime.py', - '../../configs/_base_/schedules/schedule_160k.py' -] -model = dict( - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - drop_path_rate=0.25, - windows=False, - hybrid=False, - ), - decode_head=dict( - in_channels=[64, 128, 320, 512], - num_classes=150 - ), - auxiliary_head=dict( - in_channels=320, - num_classes=150 - )) - -# AdamW optimizer, no weight decay for position embedding & layer norm in backbone -optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01, - 
paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.)})) - -lr_config = dict(_delete_=True, policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, min_lr=0.0, by_epoch=False) - -data=dict(samples_per_gpu=2) \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_h32.py b/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_h32.py deleted file mode 100644 index a31e3874f76f9f7b089ac8834d85df2441af9b0e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_h32.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = [ - '../../configs/_base_/models/upernet_uniformer.py', - '../../configs/_base_/datasets/ade20k.py', - '../../configs/_base_/default_runtime.py', - '../../configs/_base_/schedules/schedule_160k.py' -] -model = dict( - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - drop_path_rate=0.25, - windows=False, - hybrid=True, - window_size=32 - ), - decode_head=dict( - in_channels=[64, 128, 320, 512], - num_classes=150 - ), - auxiliary_head=dict( - in_channels=320, - num_classes=150 - )) - -# AdamW optimizer, no weight decay for position embedding & layer norm in backbone -optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.)})) - -lr_config = dict(_delete_=True, policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, min_lr=0.0, by_epoch=False) - -data=dict(samples_per_gpu=2) \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_w32.py b/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_w32.py deleted file mode 100644 index 3d9e06f029e46c14cb9ddb39319cabe86fef9b44..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/exp/upernet_global_small/test_config_w32.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = [ - '../../configs/_base_/models/upernet_uniformer.py', - '../../configs/_base_/datasets/ade20k.py', - '../../configs/_base_/default_runtime.py', - '../../configs/_base_/schedules/schedule_160k.py' -] -model = dict( - backbone=dict( - type='UniFormer', - embed_dim=[64, 128, 320, 512], - layers=[3, 4, 8, 3], - head_dim=64, - drop_path_rate=0.25, - windows=True, - hybrid=False, - window_size=32 - ), - decode_head=dict( - in_channels=[64, 128, 320, 512], - num_classes=150 - ), - auxiliary_head=dict( - in_channels=320, - num_classes=150 - )) - -# AdamW optimizer, no weight decay for position embedding & layer norm in backbone -optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.)})) - -lr_config = dict(_delete_=True, policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, min_lr=0.0, by_epoch=False) - -data=dict(samples_per_gpu=2) \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/mmcv/__init__.py b/ControlNet/annotator/uniformer/mmcv/__init__.py deleted file mode 100644 index 
210a2989138380559f23045b568d0fbbeb918c03..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# flake8: noqa -from .arraymisc import * -from .fileio import * -from .image import * -from .utils import * -from .version import * -from .video import * -from .visualization import * - -# The following modules are not imported to this level, so mmcv may be used -# without PyTorch. -# - runner -# - parallel -# - op diff --git a/ControlNet/annotator/uniformer/mmcv/arraymisc/__init__.py b/ControlNet/annotator/uniformer/mmcv/arraymisc/__init__.py deleted file mode 100644 index 4b4700d6139ae3d604ff6e542468cce4200c020c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/arraymisc/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .quantization import dequantize, quantize - -__all__ = ['quantize', 'dequantize'] diff --git a/ControlNet/annotator/uniformer/mmcv/arraymisc/quantization.py b/ControlNet/annotator/uniformer/mmcv/arraymisc/quantization.py deleted file mode 100644 index 8e47a3545780cf071a1ef8195efb0b7b662c8186..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/arraymisc/quantization.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np - - -def quantize(arr, min_val, max_val, levels, dtype=np.int64): - """Quantize an array of (-inf, inf) to [0, levels-1]. - - Args: - arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. - levels (int): Quantization levels. - dtype (np.type): The type of the quantized array. - - Returns: - tuple: Quantized array. - """ - if not (isinstance(levels, int) and levels > 1): - raise ValueError( - f'levels must be a positive integer, but got {levels}') - if min_val >= max_val: - raise ValueError( - f'min_val ({min_val}) must be smaller than max_val ({max_val})') - - arr = np.clip(arr, min_val, max_val) - min_val - quantized_arr = np.minimum( - np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1) - - return quantized_arr - - -def dequantize(arr, min_val, max_val, levels, dtype=np.float64): - """Dequantize an array. - - Args: - arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. - levels (int): Quantization levels. - dtype (np.type): The type of the dequantized array. - - Returns: - tuple: Dequantized array. - """ - if not (isinstance(levels, int) and levels > 1): - raise ValueError( - f'levels must be a positive integer, but got {levels}') - if min_val >= max_val: - raise ValueError( - f'min_val ({min_val}) must be smaller than max_val ({max_val})') - - dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - - min_val) / levels + min_val - - return dequantized_arr diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/__init__.py b/ControlNet/annotator/uniformer/mmcv/cnn/__init__.py deleted file mode 100644 index 7246c897430f0cc7ce12719ad8608824fc734446..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
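As a quick sanity check on the quantize/dequantize pair deleted above, here is a minimal, purely illustrative sketch (not part of the diff; the sample values and bin count are made up): values are clipped to [min_val, max_val], mapped to integer bins, and dequantized back to bin centres.

import numpy as np

vals = np.array([-0.3, 0.0, 0.7, 2.5])                     # 2.5 is clipped to max_val
q = quantize(vals, min_val=0.0, max_val=1.0, levels=10)    # -> array([0, 0, 7, 9])
r = dequantize(q, min_val=0.0, max_val=1.0, levels=10)     # -> array([0.05, 0.05, 0.75, 0.95])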
-from .alexnet import AlexNet -# yapf: disable -from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, - PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS, - ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, - ConvTranspose2d, ConvTranspose3d, ConvWS2d, - DepthwiseSeparableConvModule, GeneralizedAttention, - HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, - NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, - build_activation_layer, build_conv_layer, - build_norm_layer, build_padding_layer, build_plugin_layer, - build_upsample_layer, conv_ws_2d, is_norm) -from .builder import MODELS, build_model_from_cfg -# yapf: enable -from .resnet import ResNet, make_res_layer -from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit, - NormalInit, PretrainedInit, TruncNormalInit, UniformInit, - XavierInit, bias_init_with_prob, caffe2_xavier_init, - constant_init, fuse_conv_bn, get_model_complexity_info, - initialize, kaiming_init, normal_init, trunc_normal_init, - uniform_init, xavier_init) -from .vgg import VGG, make_vgg_layer - -__all__ = [ - 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', - 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init', - 'uniform_init', 'kaiming_init', 'caffe2_xavier_init', - 'bias_init_with_prob', 'ConvModule', 'build_activation_layer', - 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', - 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d', - 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', - 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', - 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', - 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', - 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', - 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', - 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', - 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', - 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg' -] diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/alexnet.py b/ControlNet/annotator/uniformer/mmcv/cnn/alexnet.py deleted file mode 100644 index 89e36b8c7851f895d9ae7f07149f0e707456aab0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/alexnet.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -import torch.nn as nn - - -class AlexNet(nn.Module): - """AlexNet backbone. - - Args: - num_classes (int): number of classes for classification. 
- """ - - def __init__(self, num_classes=-1): - super(AlexNet, self).__init__() - self.num_classes = num_classes - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(64, 192, kernel_size=5, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(192, 384, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - ) - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Dropout(), - nn.Linear(256 * 6 * 6, 4096), - nn.ReLU(inplace=True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, num_classes), - ) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - from ..runner import load_checkpoint - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - # use default initializer - pass - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - - x = self.features(x) - if self.num_classes > 0: - x = x.view(x.size(0), 256 * 6 * 6) - x = self.classifier(x) - - return x diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/__init__.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/__init__.py deleted file mode 100644 index 0f33124ed23fc6f27119a37bcb5ab004d3572be0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .activation import build_activation_layer -from .context_block import ContextBlock -from .conv import build_conv_layer -from .conv2d_adaptive_padding import Conv2dAdaptivePadding -from .conv_module import ConvModule -from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d -from .depthwise_separable_conv_module import DepthwiseSeparableConvModule -from .drop import Dropout, DropPath -from .generalized_attention import GeneralizedAttention -from .hsigmoid import HSigmoid -from .hswish import HSwish -from .non_local import NonLocal1d, NonLocal2d, NonLocal3d -from .norm import build_norm_layer, is_norm -from .padding import build_padding_layer -from .plugin import build_plugin_layer -from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, - PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) -from .scale import Scale -from .swish import Swish -from .upsample import build_upsample_layer -from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, - Linear, MaxPool2d, MaxPool3d) - -__all__ = [ - 'ConvModule', 'build_activation_layer', 'build_conv_layer', - 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', - 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', - 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', - 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', - 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d', - 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', - 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', - 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath' -] diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/activation.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/activation.py 
deleted file mode 100644 index cab2712287d5ef7be2f079dcb54a94b96394eab5..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/activation.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F - -from annotator.uniformer.mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version -from .registry import ACTIVATION_LAYERS - -for module in [ - nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, - nn.Sigmoid, nn.Tanh -]: - ACTIVATION_LAYERS.register_module(module=module) - - -@ACTIVATION_LAYERS.register_module(name='Clip') -@ACTIVATION_LAYERS.register_module() -class Clamp(nn.Module): - """Clamp activation layer. - - This activation function is to clamp the feature map value within - :math:`[min, max]`. More details can be found in ``torch.clamp()``. - - Args: - min (Number | optional): Lower-bound of the range to be clamped to. - Default to -1. - max (Number | optional): Upper-bound of the range to be clamped to. - Default to 1. - """ - - def __init__(self, min=-1., max=1.): - super(Clamp, self).__init__() - self.min = min - self.max = max - - def forward(self, x): - """Forward function. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: Clamped tensor. - """ - return torch.clamp(x, min=self.min, max=self.max) - - -class GELU(nn.Module): - r"""Applies the Gaussian Error Linear Units function: - - .. math:: - \text{GELU}(x) = x * \Phi(x) - where :math:`\Phi(x)` is the Cumulative Distribution Function for - Gaussian Distribution. - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional - dimensions - - Output: :math:`(N, *)`, same shape as the input - - .. image:: scripts/activation_images/GELU.png - - Examples:: - - >>> m = nn.GELU() - >>> input = torch.randn(2) - >>> output = m(input) - """ - - def forward(self, input): - return F.gelu(input) - - -if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.4')): - ACTIVATION_LAYERS.register_module(module=GELU) -else: - ACTIVATION_LAYERS.register_module(module=nn.GELU) - - -def build_activation_layer(cfg): - """Build activation layer. - - Args: - cfg (dict): The activation layer config, which should contain: - - type (str): Layer type. - - layer args: Args needed to instantiate an activation layer. - - Returns: - nn.Module: Created activation layer. - """ - return build_from_cfg(cfg, ACTIVATION_LAYERS) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/context_block.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/context_block.py deleted file mode 100644 index d60fdb904c749ce3b251510dff3cc63cea70d42e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/context_block.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn - -from ..utils import constant_init, kaiming_init -from .registry import PLUGIN_LAYERS - - -def last_zero_init(m): - if isinstance(m, nn.Sequential): - constant_init(m[-1], val=0) - else: - constant_init(m, val=0) - - -@PLUGIN_LAYERS.register_module() -class ContextBlock(nn.Module): - """ContextBlock module in GCNet. - - See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' - (https://arxiv.org/abs/1904.11492) for details. - - Args: - in_channels (int): Channels of the input feature map. 
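For context on the registry pattern in the activation module deleted above, a minimal illustrative sketch (not part of the diff): layers are built from config dicts whose 'type' key selects the registered class, with the remaining keys passed to its constructor.

relu = build_activation_layer(dict(type='ReLU', inplace=True))
clip = build_activation_layer(dict(type='Clip', min=-1.0, max=1.0))  # 'Clip' alias of Clamp registered above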
- ratio (float): Ratio of channels of transform bottleneck - pooling_type (str): Pooling method for context modeling. - Options are 'att' and 'avg', stand for attention pooling and - average pooling respectively. Default: 'att'. - fusion_types (Sequence[str]): Fusion method for feature fusion, - Options are 'channels_add', 'channel_mul', stand for channelwise - addition and multiplication respectively. Default: ('channel_add',) - """ - - _abbr_ = 'context_block' - - def __init__(self, - in_channels, - ratio, - pooling_type='att', - fusion_types=('channel_add', )): - super(ContextBlock, self).__init__() - assert pooling_type in ['avg', 'att'] - assert isinstance(fusion_types, (list, tuple)) - valid_fusion_types = ['channel_add', 'channel_mul'] - assert all([f in valid_fusion_types for f in fusion_types]) - assert len(fusion_types) > 0, 'at least one fusion should be used' - self.in_channels = in_channels - self.ratio = ratio - self.planes = int(in_channels * ratio) - self.pooling_type = pooling_type - self.fusion_types = fusion_types - if pooling_type == 'att': - self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) - self.softmax = nn.Softmax(dim=2) - else: - self.avg_pool = nn.AdaptiveAvgPool2d(1) - if 'channel_add' in fusion_types: - self.channel_add_conv = nn.Sequential( - nn.Conv2d(self.in_channels, self.planes, kernel_size=1), - nn.LayerNorm([self.planes, 1, 1]), - nn.ReLU(inplace=True), # yapf: disable - nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) - else: - self.channel_add_conv = None - if 'channel_mul' in fusion_types: - self.channel_mul_conv = nn.Sequential( - nn.Conv2d(self.in_channels, self.planes, kernel_size=1), - nn.LayerNorm([self.planes, 1, 1]), - nn.ReLU(inplace=True), # yapf: disable - nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) - else: - self.channel_mul_conv = None - self.reset_parameters() - - def reset_parameters(self): - if self.pooling_type == 'att': - kaiming_init(self.conv_mask, mode='fan_in') - self.conv_mask.inited = True - - if self.channel_add_conv is not None: - last_zero_init(self.channel_add_conv) - if self.channel_mul_conv is not None: - last_zero_init(self.channel_mul_conv) - - def spatial_pool(self, x): - batch, channel, height, width = x.size() - if self.pooling_type == 'att': - input_x = x - # [N, C, H * W] - input_x = input_x.view(batch, channel, height * width) - # [N, 1, C, H * W] - input_x = input_x.unsqueeze(1) - # [N, 1, H, W] - context_mask = self.conv_mask(x) - # [N, 1, H * W] - context_mask = context_mask.view(batch, 1, height * width) - # [N, 1, H * W] - context_mask = self.softmax(context_mask) - # [N, 1, H * W, 1] - context_mask = context_mask.unsqueeze(-1) - # [N, 1, C, 1] - context = torch.matmul(input_x, context_mask) - # [N, C, 1, 1] - context = context.view(batch, channel, 1, 1) - else: - # [N, C, 1, 1] - context = self.avg_pool(x) - - return context - - def forward(self, x): - # [N, C, 1, 1] - context = self.spatial_pool(x) - - out = x - if self.channel_mul_conv is not None: - # [N, C, 1, 1] - channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) - out = out * channel_mul_term - if self.channel_add_conv is not None: - # [N, C, 1, 1] - channel_add_term = self.channel_add_conv(context) - out = out + channel_add_term - - return out diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv.py deleted file mode 100644 index cf54491997a48ac3e7fadc4183ab7bf3e831024c..0000000000000000000000000000000000000000 --- 
a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from torch import nn - -from .registry import CONV_LAYERS - -CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d) -CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d) -CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) -CONV_LAYERS.register_module('Conv', module=nn.Conv2d) - - -def build_conv_layer(cfg, *args, **kwargs): - """Build convolution layer. - - Args: - cfg (None or dict): The conv layer config, which should contain: - - type (str): Layer type. - - layer args: Args needed to instantiate an conv layer. - args (argument list): Arguments passed to the `__init__` - method of the corresponding conv layer. - kwargs (keyword arguments): Keyword arguments passed to the `__init__` - method of the corresponding conv layer. - - Returns: - nn.Module: Created conv layer. - """ - if cfg is None: - cfg_ = dict(type='Conv2d') - else: - if not isinstance(cfg, dict): - raise TypeError('cfg must be a dict') - if 'type' not in cfg: - raise KeyError('the cfg dict must contain the key "type"') - cfg_ = cfg.copy() - - layer_type = cfg_.pop('type') - if layer_type not in CONV_LAYERS: - raise KeyError(f'Unrecognized norm type {layer_type}') - else: - conv_layer = CONV_LAYERS.get(layer_type) - - layer = conv_layer(*args, **kwargs, **cfg_) - - return layer diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv2d_adaptive_padding.py deleted file mode 100644 index b45e758ac6cf8dfb0382d072fe09125bc7e9b888..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv2d_adaptive_padding.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -from torch import nn -from torch.nn import functional as F - -from .registry import CONV_LAYERS - - -@CONV_LAYERS.register_module() -class Conv2dAdaptivePadding(nn.Conv2d): - """Implementation of 2D convolution in tensorflow with `padding` as "same", - which applies padding to input (if needed) so that input image gets fully - covered by filter and stride you specified. For stride 1, this will ensure - that output image size is same as input. For stride of 2, output dimensions - will be half, for example. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution. Default: 1 - padding (int or tuple, optional): Zero-padding added to both sides of - the input. Default: 0 - dilation (int or tuple, optional): Spacing between kernel elements. - Default: 1 - groups (int, optional): Number of blocked connections from input - channels to output channels. Default: 1 - bias (bool, optional): If ``True``, adds a learnable bias to the - output. 
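A minimal illustrative sketch of the build_conv_layer helper deleted above (not part of the diff; channel sizes are arbitrary): a cfg of None falls back to Conv2d, otherwise the 'type' key selects the registered conv class and the remaining cfg keys plus positional args are forwarded to it.

conv2d = build_conv_layer(None, 3, 16, kernel_size=3, padding=1)        # nn.Conv2d(3, 16, ...)
conv1d = build_conv_layer(dict(type='Conv1d'), 3, 16, kernel_size=3)    # nn.Conv1d(3, 16, ...)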
Default: ``True`` - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): - super().__init__(in_channels, out_channels, kernel_size, stride, 0, - dilation, groups, bias) - - def forward(self, x): - img_h, img_w = x.size()[-2:] - kernel_h, kernel_w = self.weight.size()[-2:] - stride_h, stride_w = self.stride - output_h = math.ceil(img_h / stride_h) - output_w = math.ceil(img_w / stride_w) - pad_h = ( - max((output_h - 1) * self.stride[0] + - (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) - pad_w = ( - max((output_w - 1) * self.stride[1] + - (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) - if pad_h > 0 or pad_w > 0: - x = F.pad(x, [ - pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 - ]) - return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, - self.dilation, self.groups) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_module.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_module.py deleted file mode 100644 index e60e7e62245071c77b652093fddebff3948d7c3e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_module.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import torch.nn as nn - -from annotator.uniformer.mmcv.utils import _BatchNorm, _InstanceNorm -from ..utils import constant_init, kaiming_init -from .activation import build_activation_layer -from .conv import build_conv_layer -from .norm import build_norm_layer -from .padding import build_padding_layer -from .registry import PLUGIN_LAYERS - - -@PLUGIN_LAYERS.register_module() -class ConvModule(nn.Module): - """A conv block that bundles conv/norm/activation layers. - - This block simplifies the usage of convolution layers, which are commonly - used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). - It is based upon three build methods: `build_conv_layer()`, - `build_norm_layer()` and `build_activation_layer()`. - - Besides, we add some additional features in this module. - 1. Automatically set `bias` of the conv layer. - 2. Spectral norm is supported. - 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only - supports zero and circular padding, and we add "reflect" padding mode. - - Args: - in_channels (int): Number of channels in the input feature map. - Same as that in ``nn._ConvNd``. - out_channels (int): Number of channels produced by the convolution. - Same as that in ``nn._ConvNd``. - kernel_size (int | tuple[int]): Size of the convolving kernel. - Same as that in ``nn._ConvNd``. - stride (int | tuple[int]): Stride of the convolution. - Same as that in ``nn._ConvNd``. - padding (int | tuple[int]): Zero-padding added to both sides of - the input. Same as that in ``nn._ConvNd``. - dilation (int | tuple[int]): Spacing between kernel elements. - Same as that in ``nn._ConvNd``. - groups (int): Number of blocked connections from input channels to - output channels. Same as that in ``nn._ConvNd``. - bias (bool | str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise - False. Default: "auto". - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. Default: None. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). 
- inplace (bool): Whether to use inplace mode for activation. - Default: True. - with_spectral_norm (bool): Whether use spectral norm in conv module. - Default: False. - padding_mode (str): If the `padding_mode` has not been supported by - current `Conv2d` in PyTorch, we will use our own padding layer - instead. Currently, we support ['zeros', 'circular'] with official - implementation and ['reflect'] with our own implementation. - Default: 'zeros'. - order (tuple[str]): The order of conv/norm/activation layers. It is a - sequence of "conv", "norm" and "act". Common examples are - ("conv", "norm", "act") and ("act", "conv", "norm"). - Default: ('conv', 'norm', 'act'). - """ - - _abbr_ = 'conv_block' - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias='auto', - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - inplace=True, - with_spectral_norm=False, - padding_mode='zeros', - order=('conv', 'norm', 'act')): - super(ConvModule, self).__init__() - assert conv_cfg is None or isinstance(conv_cfg, dict) - assert norm_cfg is None or isinstance(norm_cfg, dict) - assert act_cfg is None or isinstance(act_cfg, dict) - official_padding_mode = ['zeros', 'circular'] - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.inplace = inplace - self.with_spectral_norm = with_spectral_norm - self.with_explicit_padding = padding_mode not in official_padding_mode - self.order = order - assert isinstance(self.order, tuple) and len(self.order) == 3 - assert set(order) == set(['conv', 'norm', 'act']) - - self.with_norm = norm_cfg is not None - self.with_activation = act_cfg is not None - # if the conv layer is before a norm layer, bias is unnecessary. - if bias == 'auto': - bias = not self.with_norm - self.with_bias = bias - - if self.with_explicit_padding: - pad_cfg = dict(type=padding_mode) - self.padding_layer = build_padding_layer(pad_cfg, padding) - - # reset padding to 0 for conv module - conv_padding = 0 if self.with_explicit_padding else padding - # build convolution layer - self.conv = build_conv_layer( - conv_cfg, - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=conv_padding, - dilation=dilation, - groups=groups, - bias=bias) - # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups - - if self.with_spectral_norm: - self.conv = nn.utils.spectral_norm(self.conv) - - # build normalization layers - if self.with_norm: - # norm layer is after conv layer - if order.index('norm') > order.index('conv'): - norm_channels = out_channels - else: - norm_channels = in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) - self.add_module(self.norm_name, norm) - if self.with_bias: - if isinstance(norm, (_BatchNorm, _InstanceNorm)): - warnings.warn( - 'Unnecessary conv bias before batch/instance norm') - else: - self.norm_name = None - - # build activation layer - if self.with_activation: - act_cfg_ = act_cfg.copy() - # nn.Tanh has no 'inplace' argument - if act_cfg_['type'] not in [ - 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' - ]: - act_cfg_.setdefault('inplace', inplace) - self.activate = 
build_activation_layer(act_cfg_) - - # Use msra init by default - self.init_weights() - - @property - def norm(self): - if self.norm_name: - return getattr(self, self.norm_name) - else: - return None - - def init_weights(self): - # 1. It is mainly for customized conv layers with their own - # initialization manners by calling their own ``init_weights()``, - # and we do not want ConvModule to override the initialization. - # 2. For customized conv layers without their own initialization - # manners (that is, they don't have their own ``init_weights()``) - # and PyTorch's conv layers, they will be initialized by - # this method with default ``kaiming_init``. - # Note: For PyTorch's conv layers, they will be overwritten by our - # initialization implementation using default ``kaiming_init``. - if not hasattr(self.conv, 'init_weights'): - if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': - nonlinearity = 'leaky_relu' - a = self.act_cfg.get('negative_slope', 0.01) - else: - nonlinearity = 'relu' - a = 0 - kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) - if self.with_norm: - constant_init(self.norm, 1, bias=0) - - def forward(self, x, activate=True, norm=True): - for layer in self.order: - if layer == 'conv': - if self.with_explicit_padding: - x = self.padding_layer(x) - x = self.conv(x) - elif layer == 'norm' and norm and self.with_norm: - x = self.norm(x) - elif layer == 'act' and activate and self.with_activation: - x = self.activate(x) - return x diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py deleted file mode 100644 index a3941e27874993418b3b5708d5a7485f175ff9c8..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .registry import CONV_LAYERS - - -def conv_ws_2d(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - eps=1e-5): - c_in = weight.size(0) - weight_flat = weight.view(c_in, -1) - mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) - std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) - weight = (weight - mean) / (std + eps) - return F.conv2d(input, weight, bias, stride, padding, dilation, groups) - - -@CONV_LAYERS.register_module('ConvWS') -class ConvWS2d(nn.Conv2d): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - eps=1e-5): - super(ConvWS2d, self).__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.eps = eps - - def forward(self, x): - return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, - self.dilation, self.groups, self.eps) - - -@CONV_LAYERS.register_module(name='ConvAWS') -class ConvAWS2d(nn.Conv2d): - """AWS (Adaptive Weight Standardization) - - This is a variant of Weight Standardization - (https://arxiv.org/pdf/1903.10520.pdf) - It is used in DetectoRS to avoid NaN - (https://arxiv.org/pdf/2006.02334.pdf) - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the conv kernel - stride (int or tuple, optional): Stride of the convolution. 
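To make the ConvModule contract above concrete, a minimal illustrative sketch (not part of the diff; shapes are arbitrary): a conv-norm-act block assembled from config dicts, with the conv bias dropped automatically (bias='auto') because a norm layer follows.

import torch
block = ConvModule(3, 16, 3, padding=1,
                   norm_cfg=dict(type='BN'),
                   act_cfg=dict(type='ReLU'))
out = block(torch.randn(1, 3, 32, 32))   # -> shape (1, 16, 32, 32)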
Default: 1 - padding (int or tuple, optional): Zero-padding added to both sides of - the input. Default: 0 - dilation (int or tuple, optional): Spacing between kernel elements. - Default: 1 - groups (int, optional): Number of blocked connections from input - channels to output channels. Default: 1 - bias (bool, optional): If set True, adds a learnable bias to the - output. Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.register_buffer('weight_gamma', - torch.ones(self.out_channels, 1, 1, 1)) - self.register_buffer('weight_beta', - torch.zeros(self.out_channels, 1, 1, 1)) - - def _get_weight(self, weight): - weight_flat = weight.view(weight.size(0), -1) - mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) - std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) - weight = (weight - mean) / std - weight = self.weight_gamma * weight + self.weight_beta - return weight - - def forward(self, x): - weight = self._get_weight(self.weight) - return F.conv2d(x, weight, self.bias, self.stride, self.padding, - self.dilation, self.groups) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - """Override default load function. - - AWS overrides the function _load_from_state_dict to recover - weight_gamma and weight_beta if they are missing. If weight_gamma and - weight_beta are found in the checkpoint, this function will return - after super()._load_from_state_dict. Otherwise, it will compute the - mean and std of the pretrained weights and store them in weight_beta - and weight_gamma. - """ - - self.weight_gamma.data.fill_(-1) - local_missing_keys = [] - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, local_missing_keys, - unexpected_keys, error_msgs) - if self.weight_gamma.data.mean() > 0: - for k in local_missing_keys: - missing_keys.append(k) - return - weight = self.weight.data - weight_flat = weight.view(weight.size(0), -1) - mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) - std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) - self.weight_beta.data.copy_(mean) - self.weight_gamma.data.copy_(std) - missing_gamma_beta = [ - k for k in local_missing_keys - if k.endswith('weight_gamma') or k.endswith('weight_beta') - ] - for k in missing_gamma_beta: - local_missing_keys.remove(k) - for k in local_missing_keys: - missing_keys.append(k) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/depthwise_separable_conv_module.py deleted file mode 100644 index 722d5d8d71f75486e2db3008907c4eadfca41d63..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/depthwise_separable_conv_module.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from .conv_module import ConvModule - - -class DepthwiseSeparableConvModule(nn.Module): - """Depthwise separable convolution module. - - See https://arxiv.org/pdf/1704.04861.pdf for details. - - This module can replace a ConvModule with the conv block replaced by two - conv block: depthwise conv block and pointwise conv block. The depthwise - conv block contains depthwise-conv/norm/activation layers. 
The pointwise - conv block contains pointwise-conv/norm/activation layers. It should be - noted that there will be norm/activation layer in the depthwise conv block - if `norm_cfg` and `act_cfg` are specified. - - Args: - in_channels (int): Number of channels in the input feature map. - Same as that in ``nn._ConvNd``. - out_channels (int): Number of channels produced by the convolution. - Same as that in ``nn._ConvNd``. - kernel_size (int | tuple[int]): Size of the convolving kernel. - Same as that in ``nn._ConvNd``. - stride (int | tuple[int]): Stride of the convolution. - Same as that in ``nn._ConvNd``. Default: 1. - padding (int | tuple[int]): Zero-padding added to both sides of - the input. Same as that in ``nn._ConvNd``. Default: 0. - dilation (int | tuple[int]): Spacing between kernel elements. - Same as that in ``nn._ConvNd``. Default: 1. - norm_cfg (dict): Default norm config for both depthwise ConvModule and - pointwise ConvModule. Default: None. - act_cfg (dict): Default activation config for both depthwise ConvModule - and pointwise ConvModule. Default: dict(type='ReLU'). - dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is - 'default', it will be the same as `norm_cfg`. Default: 'default'. - dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is - 'default', it will be the same as `act_cfg`. Default: 'default'. - pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is - 'default', it will be the same as `norm_cfg`. Default: 'default'. - pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is - 'default', it will be the same as `act_cfg`. Default: 'default'. - kwargs (optional): Other shared arguments for depthwise and pointwise - ConvModule. See ConvModule for ref. - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - dw_norm_cfg='default', - dw_act_cfg='default', - pw_norm_cfg='default', - pw_act_cfg='default', - **kwargs): - super(DepthwiseSeparableConvModule, self).__init__() - assert 'groups' not in kwargs, 'groups should not be specified' - - # if norm/activation config of depthwise/pointwise ConvModule is not - # specified, use default config. - dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg - dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg - pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg - pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg - - # depthwise convolution - self.depthwise_conv = ConvModule( - in_channels, - in_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=in_channels, - norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, - **kwargs) - - self.pointwise_conv = ConvModule( - in_channels, - out_channels, - 1, - norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, - **kwargs) - - def forward(self, x): - x = self.depthwise_conv(x) - x = self.pointwise_conv(x) - return x diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/drop.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/drop.py deleted file mode 100644 index b7b4fccd457a0d51fb10c789df3c8537fe7b67c1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/drop.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
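For scale, a minimal illustrative sketch of the depthwise-separable module deleted above (not part of the diff; channel counts are arbitrary): the factorized block needs roughly 2.4k parameters versus roughly 18.5k for a dense Conv2d(32, 64, 3).

import torch
dw_block = DepthwiseSeparableConvModule(32, 64, 3, padding=1)
out = dw_block(torch.randn(1, 32, 16, 16))   # -> shape (1, 64, 16, 16)
# depthwise: 32*3*3 + 32 = 320 params; pointwise: 32*64 + 64 = 2112 params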
-import torch -import torch.nn as nn - -from annotator.uniformer.mmcv import build_from_cfg -from .registry import DROPOUT_LAYERS - - -def drop_path(x, drop_prob=0., training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of - residual blocks). - - We follow the implementation - https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 - """ - if drop_prob == 0. or not training: - return x - keep_prob = 1 - drop_prob - # handle tensors with different dimensions, not just 4D tensors. - shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - output = x.div(keep_prob) * random_tensor.floor() - return output - - -@DROPOUT_LAYERS.register_module() -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of - residual blocks). - - We follow the implementation - https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 - - Args: - drop_prob (float): Probability of the path to be zeroed. Default: 0.1 - """ - - def __init__(self, drop_prob=0.1): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -@DROPOUT_LAYERS.register_module() -class Dropout(nn.Dropout): - """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of - ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with - ``DropPath`` - - Args: - drop_prob (float): Probability of the elements to be - zeroed. Default: 0.5. - inplace (bool): Do the operation inplace or not. Default: False. - """ - - def __init__(self, drop_prob=0.5, inplace=False): - super().__init__(p=drop_prob, inplace=inplace) - - -def build_dropout(cfg, default_args=None): - """Builder for drop out layers.""" - return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/generalized_attention.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/generalized_attention.py deleted file mode 100644 index 988d9adf2f289ef223bd1c680a5ae1d3387f0269..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/generalized_attention.py +++ /dev/null @@ -1,412 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..utils import kaiming_init -from .registry import PLUGIN_LAYERS - - -@PLUGIN_LAYERS.register_module() -class GeneralizedAttention(nn.Module): - """GeneralizedAttention module. - - See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' - (https://arxiv.org/abs/1711.07971) for details. - - Args: - in_channels (int): Channels of the input feature map. - spatial_range (int): The spatial range. -1 indicates no spatial range - constraint. Default: -1. - num_heads (int): The head number of empirical_attention module. - Default: 9. - position_embedding_dim (int): The position embedding dimension. - Default: -1. - position_magnitude (int): A multiplier acting on coord difference. - Default: 1. - kv_stride (int): The feature stride acting on key/value feature map. - Default: 2. - q_stride (int): The feature stride acting on query feature map. - Default: 1. 
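A minimal illustrative sketch of the stochastic-depth behaviour implemented by drop_path/DropPath above (not part of the diff): during training whole samples are zeroed with probability drop_prob and the survivors are rescaled by 1/keep_prob so the expectation is preserved, while in eval mode the layer is the identity.

import torch
layer = DropPath(drop_prob=0.2)
layer.train()
out = layer(torch.ones(8, 64, 7, 7))   # ~20% of the 8 samples zeroed, survivors scaled by 1.25
layer.eval()
assert torch.equal(layer(torch.ones(2, 3)), torch.ones(2, 3))   # identity at eval time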
- attention_type (str): A binary indicator string for indicating which - items in generalized empirical_attention module are used. - Default: '1111'. - - - '1000' indicates 'query and key content' (appr - appr) item, - - '0100' indicates 'query content and relative position' - (appr - position) item, - - '0010' indicates 'key content only' (bias - appr) item, - - '0001' indicates 'relative position only' (bias - position) item. - """ - - _abbr_ = 'gen_attention_block' - - def __init__(self, - in_channels, - spatial_range=-1, - num_heads=9, - position_embedding_dim=-1, - position_magnitude=1, - kv_stride=2, - q_stride=1, - attention_type='1111'): - - super(GeneralizedAttention, self).__init__() - - # hard range means local range for non-local operation - self.position_embedding_dim = ( - position_embedding_dim - if position_embedding_dim > 0 else in_channels) - - self.position_magnitude = position_magnitude - self.num_heads = num_heads - self.in_channels = in_channels - self.spatial_range = spatial_range - self.kv_stride = kv_stride - self.q_stride = q_stride - self.attention_type = [bool(int(_)) for _ in attention_type] - self.qk_embed_dim = in_channels // num_heads - out_c = self.qk_embed_dim * num_heads - - if self.attention_type[0] or self.attention_type[1]: - self.query_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_c, - kernel_size=1, - bias=False) - self.query_conv.kaiming_init = True - - if self.attention_type[0] or self.attention_type[2]: - self.key_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_c, - kernel_size=1, - bias=False) - self.key_conv.kaiming_init = True - - self.v_dim = in_channels // num_heads - self.value_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=self.v_dim * num_heads, - kernel_size=1, - bias=False) - self.value_conv.kaiming_init = True - - if self.attention_type[1] or self.attention_type[3]: - self.appr_geom_fc_x = nn.Linear( - self.position_embedding_dim // 2, out_c, bias=False) - self.appr_geom_fc_x.kaiming_init = True - - self.appr_geom_fc_y = nn.Linear( - self.position_embedding_dim // 2, out_c, bias=False) - self.appr_geom_fc_y.kaiming_init = True - - if self.attention_type[2]: - stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) - appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv - self.appr_bias = nn.Parameter(appr_bias_value) - - if self.attention_type[3]: - stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) - geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv - self.geom_bias = nn.Parameter(geom_bias_value) - - self.proj_conv = nn.Conv2d( - in_channels=self.v_dim * num_heads, - out_channels=in_channels, - kernel_size=1, - bias=True) - self.proj_conv.kaiming_init = True - self.gamma = nn.Parameter(torch.zeros(1)) - - if self.spatial_range >= 0: - # only works when non local is after 3*3 conv - if in_channels == 256: - max_len = 84 - elif in_channels == 512: - max_len = 42 - - max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) - local_constraint_map = np.ones( - (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) - for iy in range(max_len): - for ix in range(max_len): - local_constraint_map[ - iy, ix, - max((iy - self.spatial_range) // - self.kv_stride, 0):min((iy + self.spatial_range + - 1) // self.kv_stride + - 1, max_len), - max((ix - self.spatial_range) // - self.kv_stride, 0):min((ix + self.spatial_range + - 1) // self.kv_stride + - 1, max_len)] = 0 - - self.local_constraint_map = nn.Parameter( - torch.from_numpy(local_constraint_map).byte(), - requires_grad=False) - - if self.q_stride > 1: - 
self.q_downsample = nn.AvgPool2d( - kernel_size=1, stride=self.q_stride) - else: - self.q_downsample = None - - if self.kv_stride > 1: - self.kv_downsample = nn.AvgPool2d( - kernel_size=1, stride=self.kv_stride) - else: - self.kv_downsample = None - - self.init_weights() - - def get_position_embedding(self, - h, - w, - h_kv, - w_kv, - q_stride, - kv_stride, - device, - dtype, - feat_dim, - wave_length=1000): - # the default type of Tensor is float32, leading to type mismatch - # in fp16 mode. Cast it to support fp16 mode. - h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) - h_idxs = h_idxs.view((h, 1)) * q_stride - - w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) - w_idxs = w_idxs.view((w, 1)) * q_stride - - h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( - device=device, dtype=dtype) - h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride - - w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( - device=device, dtype=dtype) - w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride - - # (h, h_kv, 1) - h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) - h_diff *= self.position_magnitude - - # (w, w_kv, 1) - w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) - w_diff *= self.position_magnitude - - feat_range = torch.arange(0, feat_dim / 4).to( - device=device, dtype=dtype) - - dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) - dim_mat = dim_mat**((4. / feat_dim) * feat_range) - dim_mat = dim_mat.view((1, 1, -1)) - - embedding_x = torch.cat( - ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) - - embedding_y = torch.cat( - ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) - - return embedding_x, embedding_y - - def forward(self, x_input): - num_heads = self.num_heads - - # use empirical_attention - if self.q_downsample is not None: - x_q = self.q_downsample(x_input) - else: - x_q = x_input - n, _, h, w = x_q.shape - - if self.kv_downsample is not None: - x_kv = self.kv_downsample(x_input) - else: - x_kv = x_input - _, _, h_kv, w_kv = x_kv.shape - - if self.attention_type[0] or self.attention_type[1]: - proj_query = self.query_conv(x_q).view( - (n, num_heads, self.qk_embed_dim, h * w)) - proj_query = proj_query.permute(0, 1, 3, 2) - - if self.attention_type[0] or self.attention_type[2]: - proj_key = self.key_conv(x_kv).view( - (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) - - if self.attention_type[1] or self.attention_type[3]: - position_embed_x, position_embed_y = self.get_position_embedding( - h, w, h_kv, w_kv, self.q_stride, self.kv_stride, - x_input.device, x_input.dtype, self.position_embedding_dim) - # (n, num_heads, w, w_kv, dim) - position_feat_x = self.appr_geom_fc_x(position_embed_x).\ - view(1, w, w_kv, num_heads, self.qk_embed_dim).\ - permute(0, 3, 1, 2, 4).\ - repeat(n, 1, 1, 1, 1) - - # (n, num_heads, h, h_kv, dim) - position_feat_y = self.appr_geom_fc_y(position_embed_y).\ - view(1, h, h_kv, num_heads, self.qk_embed_dim).\ - permute(0, 3, 1, 2, 4).\ - repeat(n, 1, 1, 1, 1) - - position_feat_x /= math.sqrt(2) - position_feat_y /= math.sqrt(2) - - # accelerate for saliency only - if (np.sum(self.attention_type) == 1) and self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim).\ - repeat(n, 1, 1, 1) - - energy = torch.matmul(appr_bias, proj_key).\ - view(n, num_heads, 1, h_kv * w_kv) - - h = 1 - w = 1 - else: - # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for - if not self.attention_type[0]: - energy = torch.zeros( - n, - num_heads, - h, - w, - 
h_kv, - w_kv, - dtype=x_input.dtype, - device=x_input.device) - - # attention_type[0]: appr - appr - # attention_type[1]: appr - position - # attention_type[2]: bias - appr - # attention_type[3]: bias - position - if self.attention_type[0] or self.attention_type[2]: - if self.attention_type[0] and self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim) - energy = torch.matmul(proj_query + appr_bias, proj_key).\ - view(n, num_heads, h, w, h_kv, w_kv) - - elif self.attention_type[0]: - energy = torch.matmul(proj_query, proj_key).\ - view(n, num_heads, h, w, h_kv, w_kv) - - elif self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim).\ - repeat(n, 1, 1, 1) - - energy += torch.matmul(appr_bias, proj_key).\ - view(n, num_heads, 1, 1, h_kv, w_kv) - - if self.attention_type[1] or self.attention_type[3]: - if self.attention_type[1] and self.attention_type[3]: - geom_bias = self.geom_bias.\ - view(1, num_heads, 1, self.qk_embed_dim) - - proj_query_reshape = (proj_query + geom_bias).\ - view(n, num_heads, h, w, self.qk_embed_dim) - - energy_x = torch.matmul( - proj_query_reshape.permute(0, 1, 3, 2, 4), - position_feat_x.permute(0, 1, 2, 4, 3)) - energy_x = energy_x.\ - permute(0, 1, 3, 2, 4).unsqueeze(4) - - energy_y = torch.matmul( - proj_query_reshape, - position_feat_y.permute(0, 1, 2, 4, 3)) - energy_y = energy_y.unsqueeze(5) - - energy += energy_x + energy_y - - elif self.attention_type[1]: - proj_query_reshape = proj_query.\ - view(n, num_heads, h, w, self.qk_embed_dim) - proj_query_reshape = proj_query_reshape.\ - permute(0, 1, 3, 2, 4) - position_feat_x_reshape = position_feat_x.\ - permute(0, 1, 2, 4, 3) - position_feat_y_reshape = position_feat_y.\ - permute(0, 1, 2, 4, 3) - - energy_x = torch.matmul(proj_query_reshape, - position_feat_x_reshape) - energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) - - energy_y = torch.matmul(proj_query_reshape, - position_feat_y_reshape) - energy_y = energy_y.unsqueeze(5) - - energy += energy_x + energy_y - - elif self.attention_type[3]: - geom_bias = self.geom_bias.\ - view(1, num_heads, self.qk_embed_dim, 1).\ - repeat(n, 1, 1, 1) - - position_feat_x_reshape = position_feat_x.\ - view(n, num_heads, w*w_kv, self.qk_embed_dim) - - position_feat_y_reshape = position_feat_y.\ - view(n, num_heads, h * h_kv, self.qk_embed_dim) - - energy_x = torch.matmul(position_feat_x_reshape, geom_bias) - energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) - - energy_y = torch.matmul(position_feat_y_reshape, geom_bias) - energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) - - energy += energy_x + energy_y - - energy = energy.view(n, num_heads, h * w, h_kv * w_kv) - - if self.spatial_range >= 0: - cur_local_constraint_map = \ - self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ - contiguous().\ - view(1, 1, h*w, h_kv*w_kv) - - energy = energy.masked_fill_(cur_local_constraint_map, - float('-inf')) - - attention = F.softmax(energy, 3) - - proj_value = self.value_conv(x_kv) - proj_value_reshape = proj_value.\ - view((n, num_heads, self.v_dim, h_kv * w_kv)).\ - permute(0, 1, 3, 2) - - out = torch.matmul(attention, proj_value_reshape).\ - permute(0, 1, 3, 2).\ - contiguous().\ - view(n, self.v_dim * self.num_heads, h, w) - - out = self.proj_conv(out) - - # output is downsampled, upsample back to input size - if self.q_downsample is not None: - out = F.interpolate( - out, - size=x_input.shape[2:], - mode='bilinear', - align_corners=False) - - out = self.gamma * out + x_input - return out 
- - def init_weights(self): - for m in self.modules(): - if hasattr(m, 'kaiming_init') and m.kaiming_init: - kaiming_init( - m, - mode='fan_in', - nonlinearity='leaky_relu', - bias=0, - distribution='uniform', - a=1) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py deleted file mode 100644 index 30b1a3d6580cf0360710426fbea1f05acdf07b4b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from .registry import ACTIVATION_LAYERS - - -@ACTIVATION_LAYERS.register_module() -class HSigmoid(nn.Module): - """Hard Sigmoid Module. Apply the hard sigmoid function: - Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) - Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) - - Args: - bias (float): Bias of the input feature map. Default: 1.0. - divisor (float): Divisor of the input feature map. Default: 2.0. - min_value (float): Lower bound value. Default: 0.0. - max_value (float): Upper bound value. Default: 1.0. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): - super(HSigmoid, self).__init__() - self.bias = bias - self.divisor = divisor - assert self.divisor != 0 - self.min_value = min_value - self.max_value = max_value - - def forward(self, x): - x = (x + self.bias) / self.divisor - - return x.clamp_(self.min_value, self.max_value) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hswish.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hswish.py deleted file mode 100644 index 7e0c090ff037c99ee6c5c84c4592e87beae02208..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/hswish.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from .registry import ACTIVATION_LAYERS - - -@ACTIVATION_LAYERS.register_module() -class HSwish(nn.Module): - """Hard Swish Module. - - This module applies the hard swish function: - - .. math:: - Hswish(x) = x * ReLU6(x + 3) / 6 - - Args: - inplace (bool): can optionally do the operation in-place. - Default: False. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, inplace=False): - super(HSwish, self).__init__() - self.act = nn.ReLU6(inplace) - - def forward(self, x): - return x * self.act(x + 3) / 6 diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/non_local.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/non_local.py deleted file mode 100644 index 92d00155ef275c1201ea66bba30470a1785cc5d7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/non_local.py +++ /dev/null @@ -1,306 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta - -import torch -import torch.nn as nn - -from ..utils import constant_init, normal_init -from .conv_module import ConvModule -from .registry import PLUGIN_LAYERS - - -class _NonLocalNd(nn.Module, metaclass=ABCMeta): - """Basic Non-local module. - - This module is proposed in - "Non-local Neural Networks" - Paper reference: https://arxiv.org/abs/1711.07971 - Code reference: https://github.com/AlexHex7/Non-local_pytorch - - Args: - in_channels (int): Channels of the input feature map. - reduction (int): Channel reduction ratio. Default: 2. 
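To illustrate the hard activations deleted above (sketch only, not part of the diff): with the defaults, HSigmoid(x) = clamp((x + 1) / 2, 0, 1) and HSwish(x) = x * ReLU6(x + 3) / 6.

import torch
x = torch.tensor([-4.0, -1.0, 0.0, 1.0, 4.0])
print(HSigmoid()(x))   # tensor([0.0000, 0.0000, 0.5000, 1.0000, 1.0000])
print(HSwish()(x))     # tensor([0.0000, -0.3333, 0.0000, 0.6667, 4.0000])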
- use_scale (bool): Whether to scale pairwise_weight by - `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. - Default: True. - conv_cfg (None | dict): The config dict for convolution layers. - If not specified, it will use `nn.Conv2d` for convolution layers. - Default: None. - norm_cfg (None | dict): The config dict for normalization layers. - Default: None. (This parameter is only applicable to conv_out.) - mode (str): Options are `gaussian`, `concatenation`, - `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. - """ - - def __init__(self, - in_channels, - reduction=2, - use_scale=True, - conv_cfg=None, - norm_cfg=None, - mode='embedded_gaussian', - **kwargs): - super(_NonLocalNd, self).__init__() - self.in_channels = in_channels - self.reduction = reduction - self.use_scale = use_scale - self.inter_channels = max(in_channels // reduction, 1) - self.mode = mode - - if mode not in [ - 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' - ]: - raise ValueError("Mode should be in 'gaussian', 'concatenation', " - f"'embedded_gaussian' or 'dot_product', but got " - f'{mode} instead.') - - # g, theta, phi are defaulted as `nn.ConvNd`. - # Here we use ConvModule for potential usage. - self.g = ConvModule( - self.in_channels, - self.inter_channels, - kernel_size=1, - conv_cfg=conv_cfg, - act_cfg=None) - self.conv_out = ConvModule( - self.inter_channels, - self.in_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - if self.mode != 'gaussian': - self.theta = ConvModule( - self.in_channels, - self.inter_channels, - kernel_size=1, - conv_cfg=conv_cfg, - act_cfg=None) - self.phi = ConvModule( - self.in_channels, - self.inter_channels, - kernel_size=1, - conv_cfg=conv_cfg, - act_cfg=None) - - if self.mode == 'concatenation': - self.concat_project = ConvModule( - self.inter_channels * 2, - 1, - kernel_size=1, - stride=1, - padding=0, - bias=False, - act_cfg=dict(type='ReLU')) - - self.init_weights(**kwargs) - - def init_weights(self, std=0.01, zeros_init=True): - if self.mode != 'gaussian': - for m in [self.g, self.theta, self.phi]: - normal_init(m.conv, std=std) - else: - normal_init(self.g.conv, std=std) - if zeros_init: - if self.conv_out.norm_cfg is None: - constant_init(self.conv_out.conv, 0) - else: - constant_init(self.conv_out.norm, 0) - else: - if self.conv_out.norm_cfg is None: - normal_init(self.conv_out.conv, std=std) - else: - normal_init(self.conv_out.norm, std=std) - - def gaussian(self, theta_x, phi_x): - # NonLocal1d pairwise_weight: [N, H, H] - # NonLocal2d pairwise_weight: [N, HxW, HxW] - # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - pairwise_weight = pairwise_weight.softmax(dim=-1) - return pairwise_weight - - def embedded_gaussian(self, theta_x, phi_x): - # NonLocal1d pairwise_weight: [N, H, H] - # NonLocal2d pairwise_weight: [N, HxW, HxW] - # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - if self.use_scale: - # theta_x.shape[-1] is `self.inter_channels` - pairwise_weight /= theta_x.shape[-1]**0.5 - pairwise_weight = pairwise_weight.softmax(dim=-1) - return pairwise_weight - - def dot_product(self, theta_x, phi_x): - # NonLocal1d pairwise_weight: [N, H, H] - # NonLocal2d pairwise_weight: [N, HxW, HxW] - # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - pairwise_weight /= pairwise_weight.shape[-1] - return pairwise_weight - - def concatenation(self, theta_x, 
phi_x): - # NonLocal1d pairwise_weight: [N, H, H] - # NonLocal2d pairwise_weight: [N, HxW, HxW] - # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] - h = theta_x.size(2) - w = phi_x.size(3) - theta_x = theta_x.repeat(1, 1, 1, w) - phi_x = phi_x.repeat(1, 1, h, 1) - - concat_feature = torch.cat([theta_x, phi_x], dim=1) - pairwise_weight = self.concat_project(concat_feature) - n, _, h, w = pairwise_weight.size() - pairwise_weight = pairwise_weight.view(n, h, w) - pairwise_weight /= pairwise_weight.shape[-1] - - return pairwise_weight - - def forward(self, x): - # Assume `reduction = 1`, then `inter_channels = C` - # or `inter_channels = C` when `mode="gaussian"` - - # NonLocal1d x: [N, C, H] - # NonLocal2d x: [N, C, H, W] - # NonLocal3d x: [N, C, T, H, W] - n = x.size(0) - - # NonLocal1d g_x: [N, H, C] - # NonLocal2d g_x: [N, HxW, C] - # NonLocal3d g_x: [N, TxHxW, C] - g_x = self.g(x).view(n, self.inter_channels, -1) - g_x = g_x.permute(0, 2, 1) - - # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H] - # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW] - # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW] - if self.mode == 'gaussian': - theta_x = x.view(n, self.in_channels, -1) - theta_x = theta_x.permute(0, 2, 1) - if self.sub_sample: - phi_x = self.phi(x).view(n, self.in_channels, -1) - else: - phi_x = x.view(n, self.in_channels, -1) - elif self.mode == 'concatenation': - theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) - phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) - else: - theta_x = self.theta(x).view(n, self.inter_channels, -1) - theta_x = theta_x.permute(0, 2, 1) - phi_x = self.phi(x).view(n, self.inter_channels, -1) - - pairwise_func = getattr(self, self.mode) - # NonLocal1d pairwise_weight: [N, H, H] - # NonLocal2d pairwise_weight: [N, HxW, HxW] - # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] - pairwise_weight = pairwise_func(theta_x, phi_x) - - # NonLocal1d y: [N, H, C] - # NonLocal2d y: [N, HxW, C] - # NonLocal3d y: [N, TxHxW, C] - y = torch.matmul(pairwise_weight, g_x) - # NonLocal1d y: [N, C, H] - # NonLocal2d y: [N, C, H, W] - # NonLocal3d y: [N, C, T, H, W] - y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, - *x.size()[2:]) - - output = x + self.conv_out(y) - - return output - - -class NonLocal1d(_NonLocalNd): - """1D Non-local module. - - Args: - in_channels (int): Same as `NonLocalND`. - sub_sample (bool): Whether to apply max pooling after pairwise - function (Note that the `sub_sample` is applied on spatial only). - Default: False. - conv_cfg (None | dict): Same as `NonLocalND`. - Default: dict(type='Conv1d'). - """ - - def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv1d'), - **kwargs): - super(NonLocal1d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) - - self.sub_sample = sub_sample - - if sub_sample: - max_pool_layer = nn.MaxPool1d(kernel_size=2) - self.g = nn.Sequential(self.g, max_pool_layer) - if self.mode != 'gaussian': - self.phi = nn.Sequential(self.phi, max_pool_layer) - else: - self.phi = max_pool_layer - - -@PLUGIN_LAYERS.register_module() -class NonLocal2d(_NonLocalNd): - """2D Non-local module. - - Args: - in_channels (int): Same as `NonLocalND`. - sub_sample (bool): Whether to apply max pooling after pairwise - function (Note that the `sub_sample` is applied on spatial only). - Default: False. - conv_cfg (None | dict): Same as `NonLocalND`. - Default: dict(type='Conv2d'). 
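For readers skimming the non-local block above: every pairwise function boils down to a matrix product over flattened positions. A minimal, self-contained sketch of the `embedded_gaussian` weighting in plain PyTorch (the helper name `embedded_gaussian_weights` is ours for illustration, not part of mmcv):

```python
import torch

def embedded_gaussian_weights(theta_x, phi_x, use_scale=True):
    # theta_x: [N, HxW, C'], phi_x: [N, C', HxW], with C' = inter_channels
    pairwise_weight = torch.matmul(theta_x, phi_x)          # [N, HxW, HxW]
    if use_scale:
        pairwise_weight = pairwise_weight / theta_x.shape[-1] ** 0.5
    return pairwise_weight.softmax(dim=-1)                  # each row sums to 1

n, c, h, w = 2, 16, 8, 8
theta = torch.randn(n, h * w, c)
phi = torch.randn(n, c, h * w)
print(embedded_gaussian_weights(theta, phi).shape)          # torch.Size([2, 64, 64])
```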
- """ - - _abbr_ = 'nonlocal_block' - - def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv2d'), - **kwargs): - super(NonLocal2d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) - - self.sub_sample = sub_sample - - if sub_sample: - max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) - self.g = nn.Sequential(self.g, max_pool_layer) - if self.mode != 'gaussian': - self.phi = nn.Sequential(self.phi, max_pool_layer) - else: - self.phi = max_pool_layer - - -class NonLocal3d(_NonLocalNd): - """3D Non-local module. - - Args: - in_channels (int): Same as `NonLocalND`. - sub_sample (bool): Whether to apply max pooling after pairwise - function (Note that the `sub_sample` is applied on spatial only). - Default: False. - conv_cfg (None | dict): Same as `NonLocalND`. - Default: dict(type='Conv3d'). - """ - - def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv3d'), - **kwargs): - super(NonLocal3d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) - self.sub_sample = sub_sample - - if sub_sample: - max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) - self.g = nn.Sequential(self.g, max_pool_layer) - if self.mode != 'gaussian': - self.phi = nn.Sequential(self.phi, max_pool_layer) - else: - self.phi = max_pool_layer diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/norm.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/norm.py deleted file mode 100644 index 408f4b42731b19a3beeef68b6a5e610d0bbc18b3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/norm.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import inspect - -import torch.nn as nn - -from annotator.uniformer.mmcv.utils import is_tuple_of -from annotator.uniformer.mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm -from .registry import NORM_LAYERS - -NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d) -NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d) -NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d) -NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d) -NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm) -NORM_LAYERS.register_module('GN', module=nn.GroupNorm) -NORM_LAYERS.register_module('LN', module=nn.LayerNorm) -NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d) -NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d) -NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d) -NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d) - - -def infer_abbr(class_type): - """Infer abbreviation from the class name. - - When we build a norm layer with `build_norm_layer()`, we want to preserve - the norm type in variable names, e.g, self.bn1, self.gn. This method will - infer the abbreviation to map class types to abbreviations. - - Rule 1: If the class has the property "_abbr_", return the property. - Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or - InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and - "in" respectively. - Rule 3: If the class name contains "batch", "group", "layer" or "instance", - the abbreviation of this layer will be "bn", "gn", "ln" and "in" - respectively. - Rule 4: Otherwise, the abbreviation falls back to "norm". - - Args: - class_type (type): The norm layer type. - - Returns: - str: The inferred abbreviation. 
- """ - if not inspect.isclass(class_type): - raise TypeError( - f'class_type must be a type, but got {type(class_type)}') - if hasattr(class_type, '_abbr_'): - return class_type._abbr_ - if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN - return 'in' - elif issubclass(class_type, _BatchNorm): - return 'bn' - elif issubclass(class_type, nn.GroupNorm): - return 'gn' - elif issubclass(class_type, nn.LayerNorm): - return 'ln' - else: - class_name = class_type.__name__.lower() - if 'batch' in class_name: - return 'bn' - elif 'group' in class_name: - return 'gn' - elif 'layer' in class_name: - return 'ln' - elif 'instance' in class_name: - return 'in' - else: - return 'norm_layer' - - -def build_norm_layer(cfg, num_features, postfix=''): - """Build normalization layer. - - Args: - cfg (dict): The norm layer config, which should contain: - - - type (str): Layer type. - - layer args: Args needed to instantiate a norm layer. - - requires_grad (bool, optional): Whether stop gradient updates. - num_features (int): Number of input channels. - postfix (int | str): The postfix to be appended into norm abbreviation - to create named layer. - - Returns: - (str, nn.Module): The first element is the layer name consisting of - abbreviation and postfix, e.g., bn1, gn. The second element is the - created norm layer. - """ - if not isinstance(cfg, dict): - raise TypeError('cfg must be a dict') - if 'type' not in cfg: - raise KeyError('the cfg dict must contain the key "type"') - cfg_ = cfg.copy() - - layer_type = cfg_.pop('type') - if layer_type not in NORM_LAYERS: - raise KeyError(f'Unrecognized norm type {layer_type}') - - norm_layer = NORM_LAYERS.get(layer_type) - abbr = infer_abbr(norm_layer) - - assert isinstance(postfix, (int, str)) - name = abbr + str(postfix) - - requires_grad = cfg_.pop('requires_grad', True) - cfg_.setdefault('eps', 1e-5) - if layer_type != 'GN': - layer = norm_layer(num_features, **cfg_) - if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): - layer._specify_ddp_gpu_num(1) - else: - assert 'num_groups' in cfg_ - layer = norm_layer(num_channels=num_features, **cfg_) - - for param in layer.parameters(): - param.requires_grad = requires_grad - - return name, layer - - -def is_norm(layer, exclude=None): - """Check if a layer is a normalization layer. - - Args: - layer (nn.Module): The layer to be checked. - exclude (type | tuple[type]): Types to be excluded. - - Returns: - bool: Whether the layer is a norm layer. - """ - if exclude is not None: - if not isinstance(exclude, tuple): - exclude = (exclude, ) - if not is_tuple_of(exclude, type): - raise TypeError( - f'"exclude" must be either None or type or a tuple of types, ' - f'but got {type(exclude)}: {exclude}') - - if exclude and isinstance(layer, exclude): - return False - - all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) - return isinstance(layer, all_norm_bases) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/padding.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/padding.py deleted file mode 100644 index e4ac6b28a1789bd551c613a7d3e7b622433ac7ec..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/padding.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch.nn as nn - -from .registry import PADDING_LAYERS - -PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) -PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) -PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) - - -def build_padding_layer(cfg, *args, **kwargs): - """Build padding layer. - - Args: - cfg (None or dict): The padding layer config, which should contain: - - type (str): Layer type. - - layer args: Args needed to instantiate a padding layer. - - Returns: - nn.Module: Created padding layer. - """ - if not isinstance(cfg, dict): - raise TypeError('cfg must be a dict') - if 'type' not in cfg: - raise KeyError('the cfg dict must contain the key "type"') - - cfg_ = cfg.copy() - padding_type = cfg_.pop('type') - if padding_type not in PADDING_LAYERS: - raise KeyError(f'Unrecognized padding type {padding_type}.') - else: - padding_layer = PADDING_LAYERS.get(padding_type) - - layer = padding_layer(*args, **kwargs, **cfg_) - - return layer diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/plugin.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/plugin.py deleted file mode 100644 index 07c010d4053174dd41107aa654ea67e82b46a25c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/plugin.py +++ /dev/null @@ -1,88 +0,0 @@ -import inspect -import platform - -from .registry import PLUGIN_LAYERS - -if platform.system() == 'Windows': - import regex as re -else: - import re - - -def infer_abbr(class_type): - """Infer abbreviation from the class name. - - This method will infer the abbreviation to map class types to - abbreviations. - - Rule 1: If the class has the property "abbr", return the property. - Rule 2: Otherwise, the abbreviation falls back to snake case of class - name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. - - Args: - class_type (type): The norm layer type. - - Returns: - str: The inferred abbreviation. - """ - - def camel2snack(word): - """Convert camel case word into snack case. - - Modified from `inflection lib - `_. - - Example:: - - >>> camel2snack("FancyBlock") - 'fancy_block' - """ - - word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) - word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) - word = word.replace('-', '_') - return word.lower() - - if not inspect.isclass(class_type): - raise TypeError( - f'class_type must be a type, but got {type(class_type)}') - if hasattr(class_type, '_abbr_'): - return class_type._abbr_ - else: - return camel2snack(class_type.__name__) - - -def build_plugin_layer(cfg, postfix='', **kwargs): - """Build plugin layer. - - Args: - cfg (None or dict): cfg should contain: - type (str): identify plugin layer type. - layer args: args needed to instantiate a plugin layer. - postfix (int, str): appended into norm abbreviation to - create named layer. Default: ''. 
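A small usage sketch for `build_padding_layer` defined in the padding module above (import path again inferred from the diff; positional and keyword args pass straight through to the padding layer):

```python
import torch
import torch.nn as nn
from annotator.uniformer.mmcv.cnn.bricks.padding import build_padding_layer

pad = build_padding_layer(dict(type='reflect'), padding=1)   # -> nn.ReflectionPad2d(1)
assert isinstance(pad, nn.ReflectionPad2d)

y = pad(torch.randn(1, 3, 8, 8))                             # (1, 3, 10, 10)
# Other registered keys: 'zero' -> nn.ZeroPad2d, 'replicate' -> nn.ReplicationPad2d.
```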
- - Returns: - tuple[str, nn.Module]: - name (str): abbreviation + postfix - layer (nn.Module): created plugin layer - """ - if not isinstance(cfg, dict): - raise TypeError('cfg must be a dict') - if 'type' not in cfg: - raise KeyError('the cfg dict must contain the key "type"') - cfg_ = cfg.copy() - - layer_type = cfg_.pop('type') - if layer_type not in PLUGIN_LAYERS: - raise KeyError(f'Unrecognized plugin type {layer_type}') - - plugin_layer = PLUGIN_LAYERS.get(layer_type) - abbr = infer_abbr(plugin_layer) - - assert isinstance(postfix, (int, str)) - name = abbr + str(postfix) - - layer = plugin_layer(**kwargs, **cfg_) - - return name, layer diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/registry.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/registry.py deleted file mode 100644 index 39eabc58db4b5954478a2ac1ab91cea5e45ab055..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/registry.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from annotator.uniformer.mmcv.utils import Registry - -CONV_LAYERS = Registry('conv layer') -NORM_LAYERS = Registry('norm layer') -ACTIVATION_LAYERS = Registry('activation layer') -PADDING_LAYERS = Registry('padding layer') -UPSAMPLE_LAYERS = Registry('upsample layer') -PLUGIN_LAYERS = Registry('plugin layer') - -DROPOUT_LAYERS = Registry('drop out layers') -POSITIONAL_ENCODING = Registry('position encoding') -ATTENTION = Registry('attention') -FEEDFORWARD_NETWORK = Registry('feed-forward Network') -TRANSFORMER_LAYER = Registry('transformerLayer') -TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/scale.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/scale.py deleted file mode 100644 index c905fffcc8bf998d18d94f927591963c428025e2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/scale.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn - - -class Scale(nn.Module): - """A learnable scale parameter. - - This layer scales the input by a learnable factor. It multiplies a - learnable scale parameter of shape (1,) with input of any shape. - - Args: - scale (float): Initial value of scale factor. Default: 1.0 - """ - - def __init__(self, scale=1.0): - super(Scale, self).__init__() - self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - - def forward(self, x): - return x * self.scale diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/swish.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/swish.py deleted file mode 100644 index e2ca8ed7b749413f011ae54aac0cab27e6f0b51f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/swish.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn - -from .registry import ACTIVATION_LAYERS - - -@ACTIVATION_LAYERS.register_module() -class Swish(nn.Module): - """Swish Module. - - This module applies the swish function: - - .. math:: - Swish(x) = x * Sigmoid(x) - - Returns: - Tensor: The output tensor. 
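To show how the plugin registry and the abbreviation rules above fit together, a hedged sketch (it assumes `NonLocal2d` from earlier in this diff is importable so that it is registered in `PLUGIN_LAYERS`; the import paths are inferred, not guaranteed):

```python
from annotator.uniformer.mmcv.cnn.bricks.non_local import NonLocal2d  # noqa: F401, registers the plugin
from annotator.uniformer.mmcv.cnn.bricks.plugin import build_plugin_layer

# NonLocal2d defines `_abbr_ = 'nonlocal_block'`, so Rule 1 of infer_abbr applies.
name, block = build_plugin_layer(dict(type='NonLocal2d', in_channels=16, reduction=2), postfix='_1')
assert name == 'nonlocal_block_1'

# Without `_abbr_`, Rule 2 falls back to snake case of the class name,
# e.g. camel2snack('FancyBlock') -> 'fancy_block'.
```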
- """ - - def __init__(self): - super(Swish, self).__init__() - - def forward(self, x): - return x * torch.sigmoid(x) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/transformer.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/transformer.py deleted file mode 100644 index e61ae0dd941a7be00b3e41a3de833ec50470a45f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/transformer.py +++ /dev/null @@ -1,595 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import warnings - -import torch -import torch.nn as nn - -from annotator.uniformer.mmcv import ConfigDict, deprecated_api_warning -from annotator.uniformer.mmcv.cnn import Linear, build_activation_layer, build_norm_layer -from annotator.uniformer.mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from annotator.uniformer.mmcv.utils import build_from_cfg -from .drop import build_dropout -from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, - TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) - -# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file -try: - from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 - warnings.warn( - ImportWarning( - '``MultiScaleDeformableAttention`` has been moved to ' - '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 - '``from annotator.uniformer.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 - 'to ``from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 - )) - -except ImportError: - warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' - '``mmcv.ops.multi_scale_deform_attn``, ' - 'You should install ``mmcv-full`` if you need this module. ') - - -def build_positional_encoding(cfg, default_args=None): - """Builder for Position Encoding.""" - return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) - - -def build_attention(cfg, default_args=None): - """Builder for attention.""" - return build_from_cfg(cfg, ATTENTION, default_args) - - -def build_feedforward_network(cfg, default_args=None): - """Builder for feed-forward network (FFN).""" - return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) - - -def build_transformer_layer(cfg, default_args=None): - """Builder for transformer layer.""" - return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) - - -def build_transformer_layer_sequence(cfg, default_args=None): - """Builder for transformer encoder and transformer decoder.""" - return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) - - -@ATTENTION.register_module() -class MultiheadAttention(BaseModule): - """A wrapper for ``torch.nn.MultiheadAttention``. - - This module implements MultiheadAttention with identity connection, - and positional encoding is also passed as input. - - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. - attn_drop (float): A Dropout layer on attn_output_weights. - Default: 0.0. - proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. - Default: 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): When it is True, Key, Query and Value are shape of - (batch, n, embed_dim), otherwise (n, batch, embed_dim). - Default to False. 
- """ - - def __init__(self, - embed_dims, - num_heads, - attn_drop=0., - proj_drop=0., - dropout_layer=dict(type='Dropout', drop_prob=0.), - init_cfg=None, - batch_first=False, - **kwargs): - super(MultiheadAttention, self).__init__(init_cfg) - if 'dropout' in kwargs: - warnings.warn('The arguments `dropout` in MultiheadAttention ' - 'has been deprecated, now you can separately ' - 'set `attn_drop`(float), proj_drop(float), ' - 'and `dropout_layer`(dict) ') - attn_drop = kwargs['dropout'] - dropout_layer['drop_prob'] = kwargs.pop('dropout') - - self.embed_dims = embed_dims - self.num_heads = num_heads - self.batch_first = batch_first - - self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, - **kwargs) - - self.proj_drop = nn.Dropout(proj_drop) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else nn.Identity() - - @deprecated_api_warning({'residual': 'identity'}, - cls_name='MultiheadAttention') - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_pos=None, - attn_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `MultiheadAttention`. - - **kwargs allow passing a more general data flow when combining - with other operations in `transformerlayer`. - - Args: - query (Tensor): The input query with shape [num_queries, bs, - embed_dims] if self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . - If None, the ``query`` will be used. Defaults to None. - value (Tensor): The value tensor with same shape as `key`. - Same in `nn.MultiheadAttention.forward`. Defaults to None. - If None, the `key` will be used. - identity (Tensor): This tensor, with the same shape as x, - will be used for the identity link. - If None, `x` will be used. Defaults to None. - query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. - key_pos (Tensor): The positional encoding for `key`, with the - same shape as `key`. Defaults to None. If not None, it will - be added to `key` before forward function. If None, and - `query_pos` has the same shape as `key`, then `query_pos` - will be used for `key_pos`. Defaults to None. - attn_mask (Tensor): ByteTensor mask with shape [num_queries, - num_keys]. Same in `nn.MultiheadAttention.forward`. - Defaults to None. - key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. - Defaults to None. - - Returns: - Tensor: forwarded results with shape - [num_queries, bs, embed_dims] - if self.batch_first is False, else - [bs, num_queries embed_dims]. 
- """ - - if key is None: - key = query - if value is None: - value = key - if identity is None: - identity = query - if key_pos is None: - if query_pos is not None: - # use query_pos if key_pos is not available - if query_pos.shape == key.shape: - key_pos = query_pos - else: - warnings.warn(f'position encoding of key is' - f'missing in {self.__class__.__name__}.') - if query_pos is not None: - query = query + query_pos - if key_pos is not None: - key = key + key_pos - - # Because the dataflow('key', 'query', 'value') of - # ``torch.nn.MultiheadAttention`` is (num_query, batch, - # embed_dims), We should adjust the shape of dataflow from - # batch_first (batch, num_query, embed_dims) to num_query_first - # (num_query ,batch, embed_dims), and recover ``attn_output`` - # from num_query_first to batch_first. - if self.batch_first: - query = query.transpose(0, 1) - key = key.transpose(0, 1) - value = value.transpose(0, 1) - - out = self.attn( - query=query, - key=key, - value=value, - attn_mask=attn_mask, - key_padding_mask=key_padding_mask)[0] - - if self.batch_first: - out = out.transpose(0, 1) - - return identity + self.dropout_layer(self.proj_drop(out)) - - -@FEEDFORWARD_NETWORK.register_module() -class FFN(BaseModule): - """Implements feed-forward networks (FFNs) with identity connection. - - Args: - embed_dims (int): The feature dimension. Same as - `MultiheadAttention`. Defaults: 256. - feedforward_channels (int): The hidden dimension of FFNs. - Defaults: 1024. - num_fcs (int, optional): The number of fully-connected layers in - FFNs. Default: 2. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='ReLU') - ffn_drop (float, optional): Probability of an element to be - zeroed in FFN. Default 0.0. - add_identity (bool, optional): Whether to add the - identity connection. Default: `True`. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - @deprecated_api_warning( - { - 'dropout': 'ffn_drop', - 'add_residual': 'add_identity' - }, - cls_name='FFN') - def __init__(self, - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0., - dropout_layer=None, - add_identity=True, - init_cfg=None, - **kwargs): - super(FFN, self).__init__(init_cfg) - assert num_fcs >= 2, 'num_fcs should be no less ' \ - f'than 2. got {num_fcs}.' - self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.num_fcs = num_fcs - self.act_cfg = act_cfg - self.activate = build_activation_layer(act_cfg) - - layers = [] - in_channels = embed_dims - for _ in range(num_fcs - 1): - layers.append( - Sequential( - Linear(in_channels, feedforward_channels), self.activate, - nn.Dropout(ffn_drop))) - in_channels = feedforward_channels - layers.append(Linear(feedforward_channels, embed_dims)) - layers.append(nn.Dropout(ffn_drop)) - self.layers = Sequential(*layers) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else torch.nn.Identity() - self.add_identity = add_identity - - @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') - def forward(self, x, identity=None): - """Forward function for `FFN`. - - The function would add x to the output tensor if residue is None. 
- """ - out = self.layers(x) - if not self.add_identity: - return self.dropout_layer(out) - if identity is None: - identity = x - return identity + self.dropout_layer(out) - - -@TRANSFORMER_LAYER.register_module() -class BaseTransformerLayer(BaseModule): - """Base `TransformerLayer` for vision transformer. - - It can be built from `mmcv.ConfigDict` and support more flexible - customization, for example, using any number of `FFN or LN ` and - use different kinds of `attention` by specifying a list of `ConfigDict` - named `attn_cfgs`. It is worth mentioning that it supports `prenorm` - when you specifying `norm` as the first element of `operation_order`. - More details about the `prenorm`: `On Layer Normalization in the - Transformer Architecture `_ . - - Args: - attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for `self_attention` or `cross_attention` modules, - The order of the configs in the list should be consistent with - corresponding attentions in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. Default: None. - ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): - Configs for FFN, The order of the configs in the list should be - consistent with corresponding ffn in operation_order. - If it is a dict, all of the attention modules in operation_order - will be built with this config. - operation_order (tuple[str]): The execution order of operation - in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). - Support `prenorm` when you specifying first element as `norm`. - Default:None. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): Key, Query and Value are shape - of (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. - """ - - def __init__(self, - attn_cfgs=None, - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0., - act_cfg=dict(type='ReLU', inplace=True), - ), - operation_order=None, - norm_cfg=dict(type='LN'), - init_cfg=None, - batch_first=False, - **kwargs): - - deprecated_args = dict( - feedforward_channels='feedforward_channels', - ffn_dropout='ffn_drop', - ffn_num_fcs='num_fcs') - for ori_name, new_name in deprecated_args.items(): - if ori_name in kwargs: - warnings.warn( - f'The arguments `{ori_name}` in BaseTransformerLayer ' - f'has been deprecated, now you should set `{new_name}` ' - f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ') - ffn_cfgs[new_name] = kwargs[ori_name] - - super(BaseTransformerLayer, self).__init__(init_cfg) - - self.batch_first = batch_first - - assert set(operation_order) & set( - ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ - set(operation_order), f'The operation_order of' \ - f' {self.__class__.__name__} should ' \ - f'contains all four operation type ' \ - f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" - - num_attn = operation_order.count('self_attn') + operation_order.count( - 'cross_attn') - if isinstance(attn_cfgs, dict): - attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] - else: - assert num_attn == len(attn_cfgs), f'The length ' \ - f'of attn_cfg {num_attn} is ' \ - f'not consistent with the number of attention' \ - f'in operation_order {operation_order}.' 
- - self.num_attn = num_attn - self.operation_order = operation_order - self.norm_cfg = norm_cfg - self.pre_norm = operation_order[0] == 'norm' - self.attentions = ModuleList() - - index = 0 - for operation_name in operation_order: - if operation_name in ['self_attn', 'cross_attn']: - if 'batch_first' in attn_cfgs[index]: - assert self.batch_first == attn_cfgs[index]['batch_first'] - else: - attn_cfgs[index]['batch_first'] = self.batch_first - attention = build_attention(attn_cfgs[index]) - # Some custom attentions used as `self_attn` - # or `cross_attn` can have different behavior. - attention.operation_name = operation_name - self.attentions.append(attention) - index += 1 - - self.embed_dims = self.attentions[0].embed_dims - - self.ffns = ModuleList() - num_ffns = operation_order.count('ffn') - if isinstance(ffn_cfgs, dict): - ffn_cfgs = ConfigDict(ffn_cfgs) - if isinstance(ffn_cfgs, dict): - ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] - assert len(ffn_cfgs) == num_ffns - for ffn_index in range(num_ffns): - if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs['embed_dims'] = self.embed_dims - else: - assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims - self.ffns.append( - build_feedforward_network(ffn_cfgs[ffn_index], - dict(type='FFN'))) - - self.norms = ModuleList() - num_norms = operation_order.count('norm') - for _ in range(num_norms): - self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) - - def forward(self, - query, - key=None, - value=None, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `TransformerDecoderLayer`. - - **kwargs contains some specific arguments of attentions. - - Args: - query (Tensor): The input query with shape - [num_queries, bs, embed_dims] if - self.batch_first is False, else - [bs, num_queries embed_dims]. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims] if self.batch_first is False, else - [bs, num_keys, embed_dims] . - value (Tensor): The value tensor with same shape as `key`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. - attn_masks (List[Tensor] | None): 2D Tensor used in - calculation of corresponding attention. The length of - it should equal to the number of `attention` in - `operation_order`. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in `self_attn` layer. - Defaults to None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
- """ - - norm_index = 0 - attn_index = 0 - ffn_index = 0 - identity = query - if attn_masks is None: - attn_masks = [None for _ in range(self.num_attn)] - elif isinstance(attn_masks, torch.Tensor): - attn_masks = [ - copy.deepcopy(attn_masks) for _ in range(self.num_attn) - ] - warnings.warn(f'Use same attn_mask in all attentions in ' - f'{self.__class__.__name__} ') - else: - assert len(attn_masks) == self.num_attn, f'The length of ' \ - f'attn_masks {len(attn_masks)} must be equal ' \ - f'to the number of attention in ' \ - f'operation_order {self.num_attn}' - - for layer in self.operation_order: - if layer == 'self_attn': - temp_key = temp_value = query - query = self.attentions[attn_index]( - query, - temp_key, - temp_value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=query_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=query_key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'norm': - query = self.norms[norm_index](query) - norm_index += 1 - - elif layer == 'cross_attn': - query = self.attentions[attn_index]( - query, - key, - value, - identity if self.pre_norm else None, - query_pos=query_pos, - key_pos=key_pos, - attn_mask=attn_masks[attn_index], - key_padding_mask=key_padding_mask, - **kwargs) - attn_index += 1 - identity = query - - elif layer == 'ffn': - query = self.ffns[ffn_index]( - query, identity if self.pre_norm else None) - ffn_index += 1 - - return query - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class TransformerLayerSequence(BaseModule): - """Base class for TransformerEncoder and TransformerDecoder in vision - transformer. - - As base-class of Encoder and Decoder in vision transformer. - Support customization such as specifying different kind - of `transformer_layer` in `transformer_coder`. - - Args: - transformerlayer (list[obj:`mmcv.ConfigDict`] | - obj:`mmcv.ConfigDict`): Config of transformerlayer - in TransformerCoder. If it is obj:`mmcv.ConfigDict`, - it would be repeated `num_layer` times to a - list[`mmcv.ConfigDict`]. Default: None. - num_layers (int): The number of `TransformerLayer`. Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): - super(TransformerLayerSequence, self).__init__(init_cfg) - if isinstance(transformerlayers, dict): - transformerlayers = [ - copy.deepcopy(transformerlayers) for _ in range(num_layers) - ] - else: - assert isinstance(transformerlayers, list) and \ - len(transformerlayers) == num_layers - self.num_layers = num_layers - self.layers = ModuleList() - for i in range(num_layers): - self.layers.append(build_transformer_layer(transformerlayers[i])) - self.embed_dims = self.layers[0].embed_dims - self.pre_norm = self.layers[0].pre_norm - - def forward(self, - query, - key, - value, - query_pos=None, - key_pos=None, - attn_masks=None, - query_key_padding_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `TransformerCoder`. - - Args: - query (Tensor): Input query with shape - `(num_queries, bs, embed_dims)`. - key (Tensor): The key tensor with shape - `(num_keys, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_keys, bs, embed_dims)`. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. - Default: None. 
- attn_masks (List[Tensor], optional): Each element is 2D Tensor - which is used in calculation of corresponding attention in - operation_order. Default: None. - query_key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_queries]. Only used in self-attention - Default: None. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_keys]. Default: None. - - Returns: - Tensor: results with shape [num_queries, bs, embed_dims]. - """ - for layer in self.layers: - query = layer( - query, - key, - value, - query_pos=query_pos, - key_pos=key_pos, - attn_masks=attn_masks, - query_key_padding_mask=query_key_padding_mask, - key_padding_mask=key_padding_mask, - **kwargs) - return query diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/upsample.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/upsample.py deleted file mode 100644 index a1a353767d0ce8518f0d7289bed10dba0178ed12..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/upsample.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn -import torch.nn.functional as F - -from ..utils import xavier_init -from .registry import UPSAMPLE_LAYERS - -UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample) -UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample) - - -@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle') -class PixelShufflePack(nn.Module): - """Pixel Shuffle upsample layer. - - This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to - achieve a simple upsampling with pixel shuffle. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - scale_factor (int): Upsample ratio. - upsample_kernel (int): Kernel size of the conv layer to expand the - channels. - """ - - def __init__(self, in_channels, out_channels, scale_factor, - upsample_kernel): - super(PixelShufflePack, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.scale_factor = scale_factor - self.upsample_kernel = upsample_kernel - self.upsample_conv = nn.Conv2d( - self.in_channels, - self.out_channels * scale_factor * scale_factor, - self.upsample_kernel, - padding=(self.upsample_kernel - 1) // 2) - self.init_weights() - - def init_weights(self): - xavier_init(self.upsample_conv, distribution='uniform') - - def forward(self, x): - x = self.upsample_conv(x) - x = F.pixel_shuffle(x, self.scale_factor) - return x - - -def build_upsample_layer(cfg, *args, **kwargs): - """Build upsample layer. - - Args: - cfg (dict): The upsample layer config, which should contain: - - - type (str): Layer type. - - scale_factor (int): Upsample ratio, which is not applicable to - deconv. - - layer args: Args needed to instantiate a upsample layer. - args (argument list): Arguments passed to the ``__init__`` - method of the corresponding conv layer. - kwargs (keyword arguments): Keyword arguments passed to the - ``__init__`` method of the corresponding conv layer. - - Returns: - nn.Module: Created upsample layer. 
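A quick sketch of the pixel-shuffle upsampler above: the conv expands channels by `scale_factor**2`, then `F.pixel_shuffle` folds them back into space (`build_upsample_layer`, implemented just below, covers the interpolation modes as well):

```python
import torch
from annotator.uniformer.mmcv.cnn.bricks.upsample import PixelShufflePack

up = PixelShufflePack(in_channels=64, out_channels=32, scale_factor=2, upsample_kernel=3)
y = up(torch.randn(1, 64, 16, 16))
assert y.shape == (1, 32, 32, 32)   # channels 64 -> 32, spatial 16 -> 32
```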
- """ - if not isinstance(cfg, dict): - raise TypeError(f'cfg must be a dict, but got {type(cfg)}') - if 'type' not in cfg: - raise KeyError( - f'the cfg dict must contain the key "type", but got {cfg}') - cfg_ = cfg.copy() - - layer_type = cfg_.pop('type') - if layer_type not in UPSAMPLE_LAYERS: - raise KeyError(f'Unrecognized upsample type {layer_type}') - else: - upsample = UPSAMPLE_LAYERS.get(layer_type) - - if upsample is nn.Upsample: - cfg_['mode'] = layer_type - layer = upsample(*args, **kwargs, **cfg_) - return layer diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/wrappers.py b/ControlNet/annotator/uniformer/mmcv/cnn/bricks/wrappers.py deleted file mode 100644 index 8aebf67bf52355a513f21756ee74fe510902d075..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/bricks/wrappers.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 - -Wrap some nn modules to support empty tensor input. Currently, these wrappers -are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask -heads are trained on only positive RoIs. -""" -import math - -import torch -import torch.nn as nn -from torch.nn.modules.utils import _pair, _triple - -from .registry import CONV_LAYERS, UPSAMPLE_LAYERS - -if torch.__version__ == 'parrots': - TORCH_VERSION = torch.__version__ -else: - # torch.__version__ could be 1.3.1+cu92, we only need the first two - # for comparison - TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) - - -def obsolete_torch_version(torch_version, version_threshold): - return torch_version == 'parrots' or torch_version <= version_threshold - - -class NewEmptyTensorOp(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, new_shape): - ctx.shape = x.shape - return x.new_empty(new_shape) - - @staticmethod - def backward(ctx, grad): - shape = ctx.shape - return NewEmptyTensorOp.apply(grad, shape), None - - -@CONV_LAYERS.register_module('Conv', force=True) -class Conv2d(nn.Conv2d): - - def forward(self, x): - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, - self.padding, self.stride, self.dilation): - o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) - - -@CONV_LAYERS.register_module('Conv3d', force=True) -class Conv3d(nn.Conv3d): - - def forward(self, x): - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, - self.padding, self.stride, self.dilation): - o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) - - -@CONV_LAYERS.register_module() -@CONV_LAYERS.register_module('deconv') -@UPSAMPLE_LAYERS.register_module('deconv', force=True) -class ConvTranspose2d(nn.ConvTranspose2d): - - def forward(self, x): - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, - self.padding, self.stride, - self.dilation, self.output_padding): - out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) - - -@CONV_LAYERS.register_module() -@CONV_LAYERS.register_module('deconv3d') -@UPSAMPLE_LAYERS.register_module('deconv3d', force=True) -class ConvTranspose3d(nn.ConvTranspose3d): - - def forward(self, x): - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, - self.padding, self.stride, - self.dilation, self.output_padding): - out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) - - -class MaxPool2d(nn.MaxPool2d): - - def forward(self, x): - # PyTorch 1.9 does not support empty tensor inference yet - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): - out_shape = list(x.shape[:2]) - for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), - _pair(self.padding), _pair(self.stride), - _pair(self.dilation)): - o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 - o = math.ceil(o) if self.ceil_mode else math.floor(o) - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - return empty - - return super().forward(x) - - -class MaxPool3d(nn.MaxPool3d): - - def forward(self, x): - # PyTorch 1.9 does not support empty tensor inference yet - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): - out_shape = list(x.shape[:2]) - for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), - _triple(self.padding), - _triple(self.stride), - _triple(self.dilation)): - o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 - o = math.ceil(o) if self.ceil_mode else math.floor(o) - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - return empty - - return super().forward(x) - - -class Linear(torch.nn.Linear): - - def forward(self, x): - # empty tensor forward of Linear layer is supported in Pytorch 1.6 - if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): - out_shape = [x.shape[0], self.out_features] - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/builder.py b/ControlNet/annotator/uniformer/mmcv/cnn/builder.py deleted file mode 100644 index 7567316c566bd3aca6d8f65a84b00e9e890948a7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/builder.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..runner import Sequential -from ..utils import Registry, build_from_cfg - - -def build_model_from_cfg(cfg, registry, default_args=None): - """Build a PyTorch model from config dict(s). Different from - ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. - - Args: - cfg (dict, list[dict]): The config of modules, is is either a config - dict or a list of config dicts. If cfg is a list, a - the built modules will be wrapped with ``nn.Sequential``. - registry (:obj:`Registry`): A registry the module belongs to. - default_args (dict, optional): Default arguments to build the module. - Defaults to None. - - Returns: - nn.Module: A built nn module. - """ - if isinstance(cfg, list): - modules = [ - build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg - ] - return Sequential(*modules) - else: - return build_from_cfg(cfg, registry, default_args) - - -MODELS = Registry('model', build_func=build_model_from_cfg) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/resnet.py b/ControlNet/annotator/uniformer/mmcv/cnn/resnet.py deleted file mode 100644 index 1cb3ac057ee2d52c46fc94685b5d4e698aad8d5f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/resnet.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -import torch.nn as nn -import torch.utils.checkpoint as cp - -from .utils import constant_init, kaiming_init - - -def conv3x3(in_planes, out_planes, stride=1, dilation=1): - """3x3 convolution with padding.""" - return nn.Conv2d( - in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=dilation, - dilation=dilation, - bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): - super(BasicBlock, self).__init__() - assert style in ['pytorch', 'caffe'] - self.conv1 = conv3x3(inplanes, planes, stride, dilation) - self.bn1 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = nn.BatchNorm2d(planes) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - assert not with_cp - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): - """Bottleneck block. - - If style is "pytorch", the stride-two layer is the 3x3 conv layer, if - it is "caffe", the stride-two layer is the first 1x1 conv layer. 
- """ - super(Bottleneck, self).__init__() - assert style in ['pytorch', 'caffe'] - if style == 'pytorch': - conv1_stride = 1 - conv2_stride = stride - else: - conv1_stride = stride - conv2_stride = 1 - self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) - self.conv2 = nn.Conv2d( - planes, - planes, - kernel_size=3, - stride=conv2_stride, - padding=dilation, - dilation=dilation, - bias=False) - - self.bn1 = nn.BatchNorm2d(planes) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d( - planes, planes * self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp - - def forward(self, x): - - def _inner_forward(x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -def make_res_layer(block, - inplanes, - planes, - blocks, - stride=1, - dilation=1, - style='pytorch', - with_cp=False): - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d( - inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append( - block( - inplanes, - planes, - stride, - dilation, - downsample, - style=style, - with_cp=with_cp)) - inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) - - return nn.Sequential(*layers) - - -class ResNet(nn.Module): - """ResNet backbone. - - Args: - depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. - num_stages (int): Resnet stages, normally 4. - strides (Sequence[int]): Strides of the first block of each stage. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze - running stats (mean and var). - bn_frozen (bool): Whether to freeze weight and bias of BN layers. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. 
- """ - - arch_settings = { - 18: (BasicBlock, (2, 2, 2, 2)), - 34: (BasicBlock, (3, 4, 6, 3)), - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, - depth, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(0, 1, 2, 3), - style='pytorch', - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - with_cp=False): - super(ResNet, self).__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for resnet') - assert num_stages >= 1 and num_stages <= 4 - block, stage_blocks = self.arch_settings[depth] - stage_blocks = stage_blocks[:num_stages] - assert len(strides) == len(dilations) == num_stages - assert max(out_indices) < num_stages - - self.out_indices = out_indices - self.style = style - self.frozen_stages = frozen_stages - self.bn_eval = bn_eval - self.bn_frozen = bn_frozen - self.with_cp = with_cp - - self.inplanes = 64 - self.conv1 = nn.Conv2d( - 3, 64, kernel_size=7, stride=2, padding=3, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self.res_layers = [] - for i, num_blocks in enumerate(stage_blocks): - stride = strides[i] - dilation = dilations[i] - planes = 64 * 2**i - res_layer = make_res_layer( - block, - self.inplanes, - planes, - num_blocks, - stride=stride, - dilation=dilation, - style=self.style, - with_cp=with_cp) - self.inplanes = planes * block.expansion - layer_name = f'layer{i + 1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) - - self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - from ..runner import load_checkpoint - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - outs = [] - for i, layer_name in enumerate(self.res_layers): - res_layer = getattr(self, layer_name) - x = res_layer(x) - if i in self.out_indices: - outs.append(x) - if len(outs) == 1: - return outs[0] - else: - return tuple(outs) - - def train(self, mode=True): - super(ResNet, self).train(mode) - if self.bn_eval: - for m in self.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eval() - if self.bn_frozen: - for params in m.parameters(): - params.requires_grad = False - if mode and self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for param in self.bn1.parameters(): - param.requires_grad = False - self.bn1.eval() - self.bn1.weight.requires_grad = False - self.bn1.bias.requires_grad = False - for i in range(1, self.frozen_stages + 1): - mod = getattr(self, f'layer{i}') - mod.eval() - for param in mod.parameters(): - param.requires_grad = False diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/utils/__init__.py b/ControlNet/annotator/uniformer/mmcv/cnn/utils/__init__.py deleted file mode 100644 index a263e31c1e3977712827ca229bbc04910b4e928e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/utils/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .flops_counter import get_model_complexity_info -from .fuse_conv_bn import fuse_conv_bn -from .sync_bn import revert_sync_batchnorm -from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, - KaimingInit, NormalInit, PretrainedInit, - TruncNormalInit, UniformInit, XavierInit, - bias_init_with_prob, caffe2_xavier_init, - constant_init, initialize, kaiming_init, normal_init, - trunc_normal_init, uniform_init, xavier_init) - -__all__ = [ - 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', - 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', - 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', - 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', - 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', - 'Caffe2XavierInit', 'revert_sync_batchnorm' -] diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/utils/flops_counter.py b/ControlNet/annotator/uniformer/mmcv/cnn/utils/flops_counter.py deleted file mode 100644 index d10af5feca7f4b8c0ba359b7b1c826f754e048be..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/utils/flops_counter.py +++ /dev/null @@ -1,599 +0,0 @@ -# Modified from flops-counter.pytorch by Vladislav Sovrasov -# original repo: https://github.com/sovrasov/flops-counter.pytorch - -# MIT License - -# Copyright (c) 2018 Vladislav Sovrasov - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import sys -from functools import partial - -import numpy as np -import torch -import torch.nn as nn - -import annotator.uniformer.mmcv as mmcv - - -def get_model_complexity_info(model, - input_shape, - print_per_layer_stat=True, - as_strings=True, - input_constructor=None, - flush=False, - ost=sys.stdout): - """Get complexity information of a model. - - This method can calculate FLOPs and parameter counts of a model with - corresponding input shape. It can also print complexity information for - each layer in a model. - - Supported layers are listed as below: - - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, - ``nn.ReLU6``. - - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, - ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, - ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, - ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, - ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. 
- - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, - ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, - ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - - Linear: ``nn.Linear``. - - Deconvolution: ``nn.ConvTranspose2d``. - - Upsample: ``nn.Upsample``. - - Args: - model (nn.Module): The model for complexity calculation. - input_shape (tuple): Input shape used for calculation. - print_per_layer_stat (bool): Whether to print complexity information - for each layer in a model. Default: True. - as_strings (bool): Output FLOPs and params counts in a string form. - Default: True. - input_constructor (None | callable): If specified, it takes a callable - method that generates input. otherwise, it will generate a random - tensor with input shape to calculate FLOPs. Default: None. - flush (bool): same as that in :func:`print`. Default: False. - ost (stream): same as ``file`` param in :func:`print`. - Default: sys.stdout. - - Returns: - tuple[float | str]: If ``as_strings`` is set to True, it will return - FLOPs and parameter counts in a string format. otherwise, it will - return those in a float number format. - """ - assert type(input_shape) is tuple - assert len(input_shape) >= 1 - assert isinstance(model, nn.Module) - flops_model = add_flops_counting_methods(model) - flops_model.eval() - flops_model.start_flops_count() - if input_constructor: - input = input_constructor(input_shape) - _ = flops_model(**input) - else: - try: - batch = torch.ones(()).new_empty( - (1, *input_shape), - dtype=next(flops_model.parameters()).dtype, - device=next(flops_model.parameters()).device) - except StopIteration: - # Avoid StopIteration for models which have no parameters, - # like `nn.Relu()`, `nn.AvgPool2d`, etc. - batch = torch.ones(()).new_empty((1, *input_shape)) - - _ = flops_model(batch) - - flops_count, params_count = flops_model.compute_average_flops_cost() - if print_per_layer_stat: - print_model_with_flops( - flops_model, flops_count, params_count, ost=ost, flush=flush) - flops_model.stop_flops_count() - - if as_strings: - return flops_to_string(flops_count), params_to_string(params_count) - - return flops_count, params_count - - -def flops_to_string(flops, units='GFLOPs', precision=2): - """Convert FLOPs number into a string. - - Note that Here we take a multiply-add counts as one FLOP. - - Args: - flops (float): FLOPs number to be converted. - units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', - 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically - choose the most suitable unit for FLOPs. Default: 'GFLOPs'. - precision (int): Digit number after the decimal point. Default: 2. - - Returns: - str: The converted FLOPs number with units. 
- - Examples: - >>> flops_to_string(1e9) - '1.0 GFLOPs' - >>> flops_to_string(2e5, 'MFLOPs') - '0.2 MFLOPs' - >>> flops_to_string(3e-9, None) - '3e-09 FLOPs' - """ - if units is None: - if flops // 10**9 > 0: - return str(round(flops / 10.**9, precision)) + ' GFLOPs' - elif flops // 10**6 > 0: - return str(round(flops / 10.**6, precision)) + ' MFLOPs' - elif flops // 10**3 > 0: - return str(round(flops / 10.**3, precision)) + ' KFLOPs' - else: - return str(flops) + ' FLOPs' - else: - if units == 'GFLOPs': - return str(round(flops / 10.**9, precision)) + ' ' + units - elif units == 'MFLOPs': - return str(round(flops / 10.**6, precision)) + ' ' + units - elif units == 'KFLOPs': - return str(round(flops / 10.**3, precision)) + ' ' + units - else: - return str(flops) + ' FLOPs' - - -def params_to_string(num_params, units=None, precision=2): - """Convert parameter number into a string. - - Args: - num_params (float): Parameter number to be converted. - units (str | None): Converted FLOPs units. Options are None, 'M', - 'K' and ''. If set to None, it will automatically choose the most - suitable unit for Parameter number. Default: None. - precision (int): Digit number after the decimal point. Default: 2. - - Returns: - str: The converted parameter number with units. - - Examples: - >>> params_to_string(1e9) - '1000.0 M' - >>> params_to_string(2e5) - '200.0 k' - >>> params_to_string(3e-9) - '3e-09' - """ - if units is None: - if num_params // 10**6 > 0: - return str(round(num_params / 10**6, precision)) + ' M' - elif num_params // 10**3: - return str(round(num_params / 10**3, precision)) + ' k' - else: - return str(num_params) - else: - if units == 'M': - return str(round(num_params / 10.**6, precision)) + ' ' + units - elif units == 'K': - return str(round(num_params / 10.**3, precision)) + ' ' + units - else: - return str(num_params) - - -def print_model_with_flops(model, - total_flops, - total_params, - units='GFLOPs', - precision=3, - ost=sys.stdout, - flush=False): - """Print a model with FLOPs for each layer. - - Args: - model (nn.Module): The model to be printed. - total_flops (float): Total FLOPs of the model. - total_params (float): Total parameter counts of the model. - units (str | None): Converted FLOPs units. Default: 'GFLOPs'. - precision (int): Digit number after the decimal point. Default: 3. - ost (stream): same as `file` param in :func:`print`. - Default: sys.stdout. - flush (bool): same as that in :func:`print`. Default: False. 
- - Example: - >>> class ExampleModel(nn.Module): - - >>> def __init__(self): - >>> super().__init__() - >>> self.conv1 = nn.Conv2d(3, 8, 3) - >>> self.conv2 = nn.Conv2d(8, 256, 3) - >>> self.conv3 = nn.Conv2d(256, 8, 3) - >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) - >>> self.flatten = nn.Flatten() - >>> self.fc = nn.Linear(8, 1) - - >>> def forward(self, x): - >>> x = self.conv1(x) - >>> x = self.conv2(x) - >>> x = self.conv3(x) - >>> x = self.avg_pool(x) - >>> x = self.flatten(x) - >>> x = self.fc(x) - >>> return x - - >>> model = ExampleModel() - >>> x = (3, 16, 16) - to print the complexity information state for each layer, you can use - >>> get_model_complexity_info(model, x) - or directly use - >>> print_model_with_flops(model, 4579784.0, 37361) - ExampleModel( - 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, - (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 - (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) - (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) - (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) - (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) - (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) - ) - """ - - def accumulate_params(self): - if is_supported_instance(self): - return self.__params__ - else: - sum = 0 - for m in self.children(): - sum += m.accumulate_params() - return sum - - def accumulate_flops(self): - if is_supported_instance(self): - return self.__flops__ / model.__batch_counter__ - else: - sum = 0 - for m in self.children(): - sum += m.accumulate_flops() - return sum - - def flops_repr(self): - accumulated_num_params = self.accumulate_params() - accumulated_flops_cost = self.accumulate_flops() - return ', '.join([ - params_to_string( - accumulated_num_params, units='M', precision=precision), - '{:.3%} Params'.format(accumulated_num_params / total_params), - flops_to_string( - accumulated_flops_cost, units=units, precision=precision), - '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), - self.original_extra_repr() - ]) - - def add_extra_repr(m): - m.accumulate_flops = accumulate_flops.__get__(m) - m.accumulate_params = accumulate_params.__get__(m) - flops_extra_repr = flops_repr.__get__(m) - if m.extra_repr != flops_extra_repr: - m.original_extra_repr = m.extra_repr - m.extra_repr = flops_extra_repr - assert m.extra_repr != m.original_extra_repr - - def del_extra_repr(m): - if hasattr(m, 'original_extra_repr'): - m.extra_repr = m.original_extra_repr - del m.original_extra_repr - if hasattr(m, 'accumulate_flops'): - del m.accumulate_flops - - model.apply(add_extra_repr) - print(model, file=ost, flush=flush) - model.apply(del_extra_repr) - - -def get_model_parameters_number(model): - """Calculate parameter number of a model. - - Args: - model (nn.module): The model for parameter number calculation. - - Returns: - float: Parameter number of the model. 
- """ - num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - return num_params - - -def add_flops_counting_methods(net_main_module): - # adding additional methods to the existing module object, - # this is done this way so that each function has access to self object - net_main_module.start_flops_count = start_flops_count.__get__( - net_main_module) - net_main_module.stop_flops_count = stop_flops_count.__get__( - net_main_module) - net_main_module.reset_flops_count = reset_flops_count.__get__( - net_main_module) - net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 - net_main_module) - - net_main_module.reset_flops_count() - - return net_main_module - - -def compute_average_flops_cost(self): - """Compute average FLOPs cost. - - A method to compute average FLOPs cost, which will be available after - `add_flops_counting_methods()` is called on a desired net object. - - Returns: - float: Current mean flops consumption per image. - """ - batches_count = self.__batch_counter__ - flops_sum = 0 - for module in self.modules(): - if is_supported_instance(module): - flops_sum += module.__flops__ - params_sum = get_model_parameters_number(self) - return flops_sum / batches_count, params_sum - - -def start_flops_count(self): - """Activate the computation of mean flops consumption per image. - - A method to activate the computation of mean flops consumption per image. - which will be available after ``add_flops_counting_methods()`` is called on - a desired net object. It should be called before running the network. - """ - add_batch_counter_hook_function(self) - - def add_flops_counter_hook_function(module): - if is_supported_instance(module): - if hasattr(module, '__flops_handle__'): - return - - else: - handle = module.register_forward_hook( - get_modules_mapping()[type(module)]) - - module.__flops_handle__ = handle - - self.apply(partial(add_flops_counter_hook_function)) - - -def stop_flops_count(self): - """Stop computing the mean flops consumption per image. - - A method to stop computing the mean flops consumption per image, which will - be available after ``add_flops_counting_methods()`` is called on a desired - net object. It can be called to pause the computation whenever. - """ - remove_batch_counter_hook_function(self) - self.apply(remove_flops_counter_hook_function) - - -def reset_flops_count(self): - """Reset statistics computed so far. - - A method to Reset computed statistics, which will be available after - `add_flops_counting_methods()` is called on a desired net object. 
- """ - add_batch_counter_variables_or_reset(self) - self.apply(add_flops_counter_variable_or_reset) - - -# ---- Internal functions -def empty_flops_counter_hook(module, input, output): - module.__flops__ += 0 - - -def upsample_flops_counter_hook(module, input, output): - output_size = output[0] - batch_size = output_size.shape[0] - output_elements_count = batch_size - for val in output_size.shape[1:]: - output_elements_count *= val - module.__flops__ += int(output_elements_count) - - -def relu_flops_counter_hook(module, input, output): - active_elements_count = output.numel() - module.__flops__ += int(active_elements_count) - - -def linear_flops_counter_hook(module, input, output): - input = input[0] - output_last_dim = output.shape[ - -1] # pytorch checks dimensions, so here we don't care much - module.__flops__ += int(np.prod(input.shape) * output_last_dim) - - -def pool_flops_counter_hook(module, input, output): - input = input[0] - module.__flops__ += int(np.prod(input.shape)) - - -def norm_flops_counter_hook(module, input, output): - input = input[0] - - batch_flops = np.prod(input.shape) - if (getattr(module, 'affine', False) - or getattr(module, 'elementwise_affine', False)): - batch_flops *= 2 - module.__flops__ += int(batch_flops) - - -def deconv_flops_counter_hook(conv_module, input, output): - # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] - input_height, input_width = input.shape[2:] - - kernel_height, kernel_width = conv_module.kernel_size - in_channels = conv_module.in_channels - out_channels = conv_module.out_channels - groups = conv_module.groups - - filters_per_channel = out_channels // groups - conv_per_position_flops = ( - kernel_height * kernel_width * in_channels * filters_per_channel) - - active_elements_count = batch_size * input_height * input_width - overall_conv_flops = conv_per_position_flops * active_elements_count - bias_flops = 0 - if conv_module.bias is not None: - output_height, output_width = output.shape[2:] - bias_flops = out_channels * batch_size * output_height * output_height - overall_flops = overall_conv_flops + bias_flops - - conv_module.__flops__ += int(overall_flops) - - -def conv_flops_counter_hook(conv_module, input, output): - # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] - output_dims = list(output.shape[2:]) - - kernel_dims = list(conv_module.kernel_size) - in_channels = conv_module.in_channels - out_channels = conv_module.out_channels - groups = conv_module.groups - - filters_per_channel = out_channels // groups - conv_per_position_flops = int( - np.prod(kernel_dims)) * in_channels * filters_per_channel - - active_elements_count = batch_size * int(np.prod(output_dims)) - - overall_conv_flops = conv_per_position_flops * active_elements_count - - bias_flops = 0 - - if conv_module.bias is not None: - - bias_flops = out_channels * active_elements_count - - overall_flops = overall_conv_flops + bias_flops - - conv_module.__flops__ += int(overall_flops) - - -def batch_counter_hook(module, input, output): - batch_size = 1 - if len(input) > 0: - # Can have multiple inputs, getting the first one - input = input[0] - batch_size = len(input) - else: - pass - print('Warning! 
No positional inputs found for a module, ' - 'assuming batch size is 1.') - module.__batch_counter__ += batch_size - - -def add_batch_counter_variables_or_reset(module): - - module.__batch_counter__ = 0 - - -def add_batch_counter_hook_function(module): - if hasattr(module, '__batch_counter_handle__'): - return - - handle = module.register_forward_hook(batch_counter_hook) - module.__batch_counter_handle__ = handle - - -def remove_batch_counter_hook_function(module): - if hasattr(module, '__batch_counter_handle__'): - module.__batch_counter_handle__.remove() - del module.__batch_counter_handle__ - - -def add_flops_counter_variable_or_reset(module): - if is_supported_instance(module): - if hasattr(module, '__flops__') or hasattr(module, '__params__'): - print('Warning: variables __flops__ or __params__ are already ' - 'defined for the module' + type(module).__name__ + - ' ptflops can affect your code!') - module.__flops__ = 0 - module.__params__ = get_model_parameters_number(module) - - -def is_supported_instance(module): - if type(module) in get_modules_mapping(): - return True - return False - - -def remove_flops_counter_hook_function(module): - if is_supported_instance(module): - if hasattr(module, '__flops_handle__'): - module.__flops_handle__.remove() - del module.__flops_handle__ - - -def get_modules_mapping(): - return { - # convolutions - nn.Conv1d: conv_flops_counter_hook, - nn.Conv2d: conv_flops_counter_hook, - mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, - nn.Conv3d: conv_flops_counter_hook, - mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, - # activations - nn.ReLU: relu_flops_counter_hook, - nn.PReLU: relu_flops_counter_hook, - nn.ELU: relu_flops_counter_hook, - nn.LeakyReLU: relu_flops_counter_hook, - nn.ReLU6: relu_flops_counter_hook, - # poolings - nn.MaxPool1d: pool_flops_counter_hook, - nn.AvgPool1d: pool_flops_counter_hook, - nn.AvgPool2d: pool_flops_counter_hook, - nn.MaxPool2d: pool_flops_counter_hook, - mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, - nn.MaxPool3d: pool_flops_counter_hook, - mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, - nn.AvgPool3d: pool_flops_counter_hook, - nn.AdaptiveMaxPool1d: pool_flops_counter_hook, - nn.AdaptiveAvgPool1d: pool_flops_counter_hook, - nn.AdaptiveMaxPool2d: pool_flops_counter_hook, - nn.AdaptiveAvgPool2d: pool_flops_counter_hook, - nn.AdaptiveMaxPool3d: pool_flops_counter_hook, - nn.AdaptiveAvgPool3d: pool_flops_counter_hook, - # normalizations - nn.BatchNorm1d: norm_flops_counter_hook, - nn.BatchNorm2d: norm_flops_counter_hook, - nn.BatchNorm3d: norm_flops_counter_hook, - nn.GroupNorm: norm_flops_counter_hook, - nn.InstanceNorm1d: norm_flops_counter_hook, - nn.InstanceNorm2d: norm_flops_counter_hook, - nn.InstanceNorm3d: norm_flops_counter_hook, - nn.LayerNorm: norm_flops_counter_hook, - # FC - nn.Linear: linear_flops_counter_hook, - mmcv.cnn.bricks.Linear: linear_flops_counter_hook, - # Upscale - nn.Upsample: upsample_flops_counter_hook, - # Deconvolution - nn.ConvTranspose2d: deconv_flops_counter_hook, - mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, - } diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py b/ControlNet/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py deleted file mode 100644 index cb7076f80bf37f7931185bf0293ffcc1ce19c8ef..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/utils/fuse_conv_bn.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch -import torch.nn as nn - - -def _fuse_conv_bn(conv, bn): - """Fuse conv and bn into one module. - - Args: - conv (nn.Module): Conv to be fused. - bn (nn.Module): BN to be fused. - - Returns: - nn.Module: Fused module. - """ - conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like( - bn.running_mean) - - factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) - conv.weight = nn.Parameter(conv_w * - factor.reshape([conv.out_channels, 1, 1, 1])) - conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) - return conv - - -def fuse_conv_bn(module): - """Recursively fuse conv and bn in a module. - - During inference, the functionary of batch norm layers is turned off - but only the mean and var alone channels are used, which exposes the - chance to fuse it with the preceding conv layers to save computations and - simplify network structures. - - Args: - module (nn.Module): Module to be fused. - - Returns: - nn.Module: Fused module. - """ - last_conv = None - last_conv_name = None - - for name, child in module.named_children(): - if isinstance(child, - (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): - if last_conv is None: # only fuse BN that is after Conv - continue - fused_conv = _fuse_conv_bn(last_conv, child) - module._modules[last_conv_name] = fused_conv - # To reduce changes, set BN as Identity instead of deleting it. - module._modules[name] = nn.Identity() - last_conv = None - elif isinstance(child, nn.Conv2d): - last_conv = child - last_conv_name = name - else: - fuse_conv_bn(child) - return module diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/utils/sync_bn.py b/ControlNet/annotator/uniformer/mmcv/cnn/utils/sync_bn.py deleted file mode 100644 index f78f39181d75bb85c53e8c7c8eaf45690e9f0bee..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/utils/sync_bn.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch - -import annotator.uniformer.mmcv as mmcv - - -class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): - """A general BatchNorm layer without input dimension check. - - Reproduced from @kapily's work: - (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) - The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc - is `_check_input_dim` that is designed for tensor sanity checks. - The check has been bypassed in this class for the convenience of converting - SyncBatchNorm. - """ - - def _check_input_dim(self, input): - return - - -def revert_sync_batchnorm(module): - """Helper function to convert all `SyncBatchNorm` (SyncBN) and - `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to - `BatchNormXd` layers. - - Adapted from @kapily's work: - (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) - - Args: - module (nn.Module): The module containing `SyncBatchNorm` layers. - - Returns: - module_output: The converted module with `BatchNormXd` layers. 
- """ - module_output = module - module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] - if hasattr(mmcv, 'ops'): - module_checklist.append(mmcv.ops.SyncBatchNorm) - if isinstance(module, tuple(module_checklist)): - module_output = _BatchNormXd(module.num_features, module.eps, - module.momentum, module.affine, - module.track_running_stats) - if module.affine: - # no_grad() may not be needed here but - # just to be consistent with `convert_sync_batchnorm()` - with torch.no_grad(): - module_output.weight = module.weight - module_output.bias = module.bias - module_output.running_mean = module.running_mean - module_output.running_var = module.running_var - module_output.num_batches_tracked = module.num_batches_tracked - module_output.training = module.training - # qconfig exists in quantized models - if hasattr(module, 'qconfig'): - module_output.qconfig = module.qconfig - for name, child in module.named_children(): - module_output.add_module(name, revert_sync_batchnorm(child)) - del module - return module_output diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/utils/weight_init.py b/ControlNet/annotator/uniformer/mmcv/cnn/utils/weight_init.py deleted file mode 100644 index 287a1d0bffe26e023029d48634d9b761deda7ba4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/utils/weight_init.py +++ /dev/null @@ -1,684 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import math -import warnings - -import numpy as np -import torch -import torch.nn as nn -from torch import Tensor - -from annotator.uniformer.mmcv.utils import Registry, build_from_cfg, get_logger, print_log - -INITIALIZERS = Registry('initializer') - - -def update_init_info(module, init_info): - """Update the `_params_init_info` in the module if the value of parameters - are changed. - - Args: - module (obj:`nn.Module`): The module of PyTorch with a user-defined - attribute `_params_init_info` which records the initialization - information. - init_info (str): The string that describes the initialization. - """ - assert hasattr( - module, - '_params_init_info'), f'Can not find `_params_init_info` in {module}' - for name, param in module.named_parameters(): - - assert param in module._params_init_info, ( - f'Find a new :obj:`Parameter` ' - f'named `{name}` during executing the ' - f'`init_weights` of ' - f'`{module.__class__.__name__}`. ' - f'Please do not add or ' - f'replace parameters during executing ' - f'the `init_weights`. 
') - - # The parameter has been changed during executing the - # `init_weights` of module - mean_value = param.data.mean() - if module._params_init_info[param]['tmp_mean_value'] != mean_value: - module._params_init_info[param]['init_info'] = init_info - module._params_init_info[param]['tmp_mean_value'] = mean_value - - -def constant_init(module, val, bias=0): - if hasattr(module, 'weight') and module.weight is not None: - nn.init.constant_(module.weight, val) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) - - -def xavier_init(module, gain=1, bias=0, distribution='normal'): - assert distribution in ['uniform', 'normal'] - if hasattr(module, 'weight') and module.weight is not None: - if distribution == 'uniform': - nn.init.xavier_uniform_(module.weight, gain=gain) - else: - nn.init.xavier_normal_(module.weight, gain=gain) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) - - -def normal_init(module, mean=0, std=1, bias=0): - if hasattr(module, 'weight') and module.weight is not None: - nn.init.normal_(module.weight, mean, std) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) - - -def trunc_normal_init(module: nn.Module, - mean: float = 0, - std: float = 1, - a: float = -2, - b: float = 2, - bias: float = 0) -> None: - if hasattr(module, 'weight') and module.weight is not None: - trunc_normal_(module.weight, mean, std, a, b) # type: ignore - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) # type: ignore - - -def uniform_init(module, a=0, b=1, bias=0): - if hasattr(module, 'weight') and module.weight is not None: - nn.init.uniform_(module.weight, a, b) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) - - -def kaiming_init(module, - a=0, - mode='fan_out', - nonlinearity='relu', - bias=0, - distribution='normal'): - assert distribution in ['uniform', 'normal'] - if hasattr(module, 'weight') and module.weight is not None: - if distribution == 'uniform': - nn.init.kaiming_uniform_( - module.weight, a=a, mode=mode, nonlinearity=nonlinearity) - else: - nn.init.kaiming_normal_( - module.weight, a=a, mode=mode, nonlinearity=nonlinearity) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, bias) - - -def caffe2_xavier_init(module, bias=0): - # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch - # Acknowledgment to FAIR's internal code - kaiming_init( - module, - a=1, - mode='fan_in', - nonlinearity='leaky_relu', - bias=bias, - distribution='uniform') - - -def bias_init_with_prob(prior_prob): - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -def _get_bases_name(m): - return [b.__name__ for b in m.__class__.__bases__] - - -class BaseInit(object): - - def __init__(self, *, bias=0, bias_prob=None, layer=None): - self.wholemodule = False - if not isinstance(bias, (int, float)): - raise TypeError(f'bias must be a number, but got a {type(bias)}') - - if bias_prob is not None: - if not isinstance(bias_prob, float): - raise TypeError(f'bias_prob type must be float, \ - but got {type(bias_prob)}') - - if layer is not None: - if not isinstance(layer, (str, list)): - raise TypeError(f'layer must be a str or a list of str, \ - but got a {type(layer)}') - else: - layer = [] - - if bias_prob is not None: - self.bias 
= bias_init_with_prob(bias_prob) - else: - self.bias = bias - self.layer = [layer] if isinstance(layer, str) else layer - - def _get_init_info(self): - info = f'{self.__class__.__name__}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Constant') -class ConstantInit(BaseInit): - """Initialize module parameters with constant values. - - Args: - val (int | float): the value to fill the weights in the module with - bias (int | float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. - """ - - def __init__(self, val, **kwargs): - super().__init__(**kwargs) - self.val = val - - def __call__(self, module): - - def init(m): - if self.wholemodule: - constant_init(m, self.val, self.bias) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - constant_init(m, self.val, self.bias) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Xavier') -class XavierInit(BaseInit): - r"""Initialize module parameters with values according to the method - described in `Understanding the difficulty of training deep feedforward - neural networks - Glorot, X. & Bengio, Y. (2010). - `_ - - Args: - gain (int | float): an optional scaling factor. Defaults to 1. - bias (int | float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - distribution (str): distribution either be ``'normal'`` - or ``'uniform'``. Defaults to ``'normal'``. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. - """ - - def __init__(self, gain=1, distribution='normal', **kwargs): - super().__init__(**kwargs) - self.gain = gain - self.distribution = distribution - - def __call__(self, module): - - def init(m): - if self.wholemodule: - xavier_init(m, self.gain, self.bias, self.distribution) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - xavier_init(m, self.gain, self.bias, self.distribution) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: gain={self.gain}, ' \ - f'distribution={self.distribution}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Normal') -class NormalInit(BaseInit): - r"""Initialize module parameters with the values drawn from the normal - distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. - - Args: - mean (int | float):the mean of the normal distribution. Defaults to 0. - std (int | float): the standard deviation of the normal distribution. - Defaults to 1. - bias (int | float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. 
- - """ - - def __init__(self, mean=0, std=1, **kwargs): - super().__init__(**kwargs) - self.mean = mean - self.std = std - - def __call__(self, module): - - def init(m): - if self.wholemodule: - normal_init(m, self.mean, self.std, self.bias) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - normal_init(m, self.mean, self.std, self.bias) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: mean={self.mean},' \ - f' std={self.std}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='TruncNormal') -class TruncNormalInit(BaseInit): - r"""Initialize module parameters with the values drawn from the normal - distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values - outside :math:`[a, b]`. - - Args: - mean (float): the mean of the normal distribution. Defaults to 0. - std (float): the standard deviation of the normal distribution. - Defaults to 1. - a (float): The minimum cutoff value. - b ( float): The maximum cutoff value. - bias (float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. - - """ - - def __init__(self, - mean: float = 0, - std: float = 1, - a: float = -2, - b: float = 2, - **kwargs) -> None: - super().__init__(**kwargs) - self.mean = mean - self.std = std - self.a = a - self.b = b - - def __call__(self, module: nn.Module) -> None: - - def init(m): - if self.wholemodule: - trunc_normal_init(m, self.mean, self.std, self.a, self.b, - self.bias) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - trunc_normal_init(m, self.mean, self.std, self.a, self.b, - self.bias) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ - f' mean={self.mean}, std={self.std}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Uniform') -class UniformInit(BaseInit): - r"""Initialize module parameters with values drawn from the uniform - distribution :math:`\mathcal{U}(a, b)`. - - Args: - a (int | float): the lower bound of the uniform distribution. - Defaults to 0. - b (int | float): the upper bound of the uniform distribution. - Defaults to 1. - bias (int | float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. 
- """ - - def __init__(self, a=0, b=1, **kwargs): - super().__init__(**kwargs) - self.a = a - self.b = b - - def __call__(self, module): - - def init(m): - if self.wholemodule: - uniform_init(m, self.a, self.b, self.bias) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - uniform_init(m, self.a, self.b, self.bias) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: a={self.a},' \ - f' b={self.b}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Kaiming') -class KaimingInit(BaseInit): - r"""Initialize module parameters with the values according to the method - described in `Delving deep into rectifiers: Surpassing human-level - performance on ImageNet classification - He, K. et al. (2015). - `_ - - Args: - a (int | float): the negative slope of the rectifier used after this - layer (only used with ``'leaky_relu'``). Defaults to 0. - mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing - ``'fan_in'`` preserves the magnitude of the variance of the weights - in the forward pass. Choosing ``'fan_out'`` preserves the - magnitudes in the backwards pass. Defaults to ``'fan_out'``. - nonlinearity (str): the non-linear function (`nn.functional` name), - recommended to use only with ``'relu'`` or ``'leaky_relu'`` . - Defaults to 'relu'. - bias (int | float): the value to fill the bias. Defaults to 0. - bias_prob (float, optional): the probability for bias initialization. - Defaults to None. - distribution (str): distribution either be ``'normal'`` or - ``'uniform'``. Defaults to ``'normal'``. - layer (str | list[str], optional): the layer will be initialized. - Defaults to None. - """ - - def __init__(self, - a=0, - mode='fan_out', - nonlinearity='relu', - distribution='normal', - **kwargs): - super().__init__(**kwargs) - self.a = a - self.mode = mode - self.nonlinearity = nonlinearity - self.distribution = distribution - - def __call__(self, module): - - def init(m): - if self.wholemodule: - kaiming_init(m, self.a, self.mode, self.nonlinearity, - self.bias, self.distribution) - else: - layername = m.__class__.__name__ - basesname = _get_bases_name(m) - if len(set(self.layer) & set([layername] + basesname)): - kaiming_init(m, self.a, self.mode, self.nonlinearity, - self.bias, self.distribution) - - module.apply(init) - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ - f'nonlinearity={self.nonlinearity}, ' \ - f'distribution ={self.distribution}, bias={self.bias}' - return info - - -@INITIALIZERS.register_module(name='Caffe2Xavier') -class Caffe2XavierInit(KaimingInit): - # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch - # Acknowledgment to FAIR's internal code - def __init__(self, **kwargs): - super().__init__( - a=1, - mode='fan_in', - nonlinearity='leaky_relu', - distribution='uniform', - **kwargs) - - def __call__(self, module): - super().__call__(module) - - -@INITIALIZERS.register_module(name='Pretrained') -class PretrainedInit(object): - """Initialize module by loading a pretrained model. - - Args: - checkpoint (str): the checkpoint file of the pretrained model should - be load. - prefix (str, optional): the prefix of a sub-module in the pretrained - model. 
it is for loading a part of the pretrained model to - initialize. For example, if we would like to only load the - backbone of a detector model, we can set ``prefix='backbone.'``. - Defaults to None. - map_location (str): map tensors into proper locations. - """ - - def __init__(self, checkpoint, prefix=None, map_location=None): - self.checkpoint = checkpoint - self.prefix = prefix - self.map_location = map_location - - def __call__(self, module): - from annotator.uniformer.mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, - load_state_dict) - logger = get_logger('mmcv') - if self.prefix is None: - print_log(f'load model from: {self.checkpoint}', logger=logger) - load_checkpoint( - module, - self.checkpoint, - map_location=self.map_location, - strict=False, - logger=logger) - else: - print_log( - f'load {self.prefix} in model from: {self.checkpoint}', - logger=logger) - state_dict = _load_checkpoint_with_prefix( - self.prefix, self.checkpoint, map_location=self.map_location) - load_state_dict(module, state_dict, strict=False, logger=logger) - - if hasattr(module, '_params_init_info'): - update_init_info(module, init_info=self._get_init_info()) - - def _get_init_info(self): - info = f'{self.__class__.__name__}: load from {self.checkpoint}' - return info - - -def _initialize(module, cfg, wholemodule=False): - func = build_from_cfg(cfg, INITIALIZERS) - # wholemodule flag is for override mode, there is no layer key in override - # and initializer will give init values for the whole module with the name - # in override. - func.wholemodule = wholemodule - func(module) - - -def _initialize_override(module, override, cfg): - if not isinstance(override, (dict, list)): - raise TypeError(f'override must be a dict or a list of dict, \ - but got {type(override)}') - - override = [override] if isinstance(override, dict) else override - - for override_ in override: - - cp_override = copy.deepcopy(override_) - name = cp_override.pop('name', None) - if name is None: - raise ValueError('`override` must contain the key "name",' - f'but got {cp_override}') - # if override only has name key, it means use args in init_cfg - if not cp_override: - cp_override.update(cfg) - # if override has name key and other args except type key, it will - # raise error - elif 'type' not in cp_override.keys(): - raise ValueError( - f'`override` need "type" key, but got {cp_override}') - - if hasattr(module, name): - _initialize(getattr(module, name), cp_override, wholemodule=True) - else: - raise RuntimeError(f'module did not have attribute {name}, ' - f'but init_cfg is {cp_override}.') - - -def initialize(module, init_cfg): - """Initialize a module. - - Args: - module (``torch.nn.Module``): the module will be initialized. - init_cfg (dict | list[dict]): initialization configuration dict to - define initializer. OpenMMLab has implemented 6 initializers - including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, - ``Kaiming``, and ``Pretrained``. 
- Example: - >>> module = nn.Linear(2, 3, bias=True) - >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) - >>> initialize(module, init_cfg) - - >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) - >>> # define key ``'layer'`` for initializing layer with different - >>> # configuration - >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), - dict(type='Constant', layer='Linear', val=2)] - >>> initialize(module, init_cfg) - - >>> # define key``'override'`` to initialize some specific part in - >>> # module - >>> class FooNet(nn.Module): - >>> def __init__(self): - >>> super().__init__() - >>> self.feat = nn.Conv2d(3, 16, 3) - >>> self.reg = nn.Conv2d(16, 10, 3) - >>> self.cls = nn.Conv2d(16, 5, 3) - >>> model = FooNet() - >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', - >>> override=dict(type='Constant', name='reg', val=3, bias=4)) - >>> initialize(model, init_cfg) - - >>> model = ResNet(depth=50) - >>> # Initialize weights with the pretrained model. - >>> init_cfg = dict(type='Pretrained', - checkpoint='torchvision://resnet50') - >>> initialize(model, init_cfg) - - >>> # Initialize weights of a sub-module with the specific part of - >>> # a pretrained model by using "prefix". - >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ - >>> 'retinanet_r50_fpn_1x_coco/'\ - >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' - >>> init_cfg = dict(type='Pretrained', - checkpoint=url, prefix='backbone.') - """ - if not isinstance(init_cfg, (dict, list)): - raise TypeError(f'init_cfg must be a dict or a list of dict, \ - but got {type(init_cfg)}') - - if isinstance(init_cfg, dict): - init_cfg = [init_cfg] - - for cfg in init_cfg: - # should deeply copy the original config because cfg may be used by - # other modules, e.g., one init_cfg shared by multiple bottleneck - # blocks, the expected cfg will be changed after pop and will change - # the initialization behavior of other modules - cp_cfg = copy.deepcopy(cfg) - override = cp_cfg.pop('override', None) - _initialize(module, cp_cfg) - - if override is not None: - cp_cfg.pop('layer', None) - _initialize_override(module, override, cp_cfg) - else: - # All attributes in module have same initialization. - pass - - -def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, - b: float) -> Tensor: - # Method based on - # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - # Modified from - # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' - 'The distribution of values may be incorrect.', - stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - lower = norm_cdf((a - mean) / std) - upper = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [lower, upper], then translate - # to [2lower-1, 2upper-1]. 
- tensor.uniform_(2 * lower - 1, 2 * upper - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor: Tensor, - mean: float = 0., - std: float = 1., - a: float = -2., - b: float = 2.) -> Tensor: - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. - - Modified from - https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py - - Args: - tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. - mean (float): the mean of the normal distribution. - std (float): the standard deviation of the normal distribution. - a (float): the minimum cutoff value. - b (float): the maximum cutoff value. - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/ControlNet/annotator/uniformer/mmcv/cnn/vgg.py b/ControlNet/annotator/uniformer/mmcv/cnn/vgg.py deleted file mode 100644 index 8778b649561a45a9652b1a15a26c2d171e58f3e1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/cnn/vgg.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -import torch.nn as nn - -from .utils import constant_init, kaiming_init, normal_init - - -def conv3x3(in_planes, out_planes, dilation=1): - """3x3 convolution with padding.""" - return nn.Conv2d( - in_planes, - out_planes, - kernel_size=3, - padding=dilation, - dilation=dilation) - - -def make_vgg_layer(inplanes, - planes, - num_blocks, - dilation=1, - with_bn=False, - ceil_mode=False): - layers = [] - for _ in range(num_blocks): - layers.append(conv3x3(inplanes, planes, dilation)) - if with_bn: - layers.append(nn.BatchNorm2d(planes)) - layers.append(nn.ReLU(inplace=True)) - inplanes = planes - layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) - - return layers - - -class VGG(nn.Module): - """VGG backbone. - - Args: - depth (int): Depth of vgg, from {11, 13, 16, 19}. - with_bn (bool): Use BatchNorm or not. - num_classes (int): number of classes for classification. - num_stages (int): VGG stages, normally 5. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze - running stats (mean and var). - bn_frozen (bool): Whether to freeze weight and bias of BN layers. 
- """ - - arch_settings = { - 11: (1, 1, 2, 2, 2), - 13: (2, 2, 2, 2, 2), - 16: (2, 2, 3, 3, 3), - 19: (2, 2, 4, 4, 4) - } - - def __init__(self, - depth, - with_bn=False, - num_classes=-1, - num_stages=5, - dilations=(1, 1, 1, 1, 1), - out_indices=(0, 1, 2, 3, 4), - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - ceil_mode=False, - with_last_pool=True): - super(VGG, self).__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for vgg') - assert num_stages >= 1 and num_stages <= 5 - stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - assert len(dilations) == num_stages - assert max(out_indices) <= num_stages - - self.num_classes = num_classes - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.bn_eval = bn_eval - self.bn_frozen = bn_frozen - - self.inplanes = 3 - start_idx = 0 - vgg_layers = [] - self.range_sub_modules = [] - for i, num_blocks in enumerate(self.stage_blocks): - num_modules = num_blocks * (2 + with_bn) + 1 - end_idx = start_idx + num_modules - dilation = dilations[i] - planes = 64 * 2**i if i < 4 else 512 - vgg_layer = make_vgg_layer( - self.inplanes, - planes, - num_blocks, - dilation=dilation, - with_bn=with_bn, - ceil_mode=ceil_mode) - vgg_layers.extend(vgg_layer) - self.inplanes = planes - self.range_sub_modules.append([start_idx, end_idx]) - start_idx = end_idx - if not with_last_pool: - vgg_layers.pop(-1) - self.range_sub_modules[-1][1] -= 1 - self.module_name = 'features' - self.add_module(self.module_name, nn.Sequential(*vgg_layers)) - - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Linear(512 * 7 * 7, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, num_classes), - ) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - from ..runner import load_checkpoint - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - outs = [] - vgg_layers = getattr(self, self.module_name) - for i in range(len(self.stage_blocks)): - for j in range(*self.range_sub_modules[i]): - vgg_layer = vgg_layers[j] - x = vgg_layer(x) - if i in self.out_indices: - outs.append(x) - if self.num_classes > 0: - x = x.view(x.size(0), -1) - x = self.classifier(x) - outs.append(x) - if len(outs) == 1: - return outs[0] - else: - return tuple(outs) - - def train(self, mode=True): - super(VGG, self).train(mode) - if self.bn_eval: - for m in self.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eval() - if self.bn_frozen: - for params in m.parameters(): - params.requires_grad = False - vgg_layers = getattr(self, self.module_name) - if mode and self.frozen_stages >= 0: - for i in range(self.frozen_stages): - for j in range(*self.range_sub_modules[i]): - mod = vgg_layers[j] - mod.eval() - for param in mod.parameters(): - param.requires_grad = False diff --git a/ControlNet/annotator/uniformer/mmcv/engine/__init__.py b/ControlNet/annotator/uniformer/mmcv/engine/__init__.py deleted file mode 100644 index 3193b7f664e19ce2458d81c836597fa22e4bb082..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/engine/__init__.py +++ 
/dev/null @@ -1,8 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, - single_gpu_test) - -__all__ = [ - 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', - 'single_gpu_test' -] diff --git a/ControlNet/annotator/uniformer/mmcv/engine/test.py b/ControlNet/annotator/uniformer/mmcv/engine/test.py deleted file mode 100644 index 8dbeef271db634ec2dadfda3bc0b5ef9c7a677ff..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/engine/test.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import pickle -import shutil -import tempfile -import time - -import torch -import torch.distributed as dist - -import annotator.uniformer.mmcv as mmcv -from annotator.uniformer.mmcv.runner import get_dist_info - - -def single_gpu_test(model, data_loader): - """Test model with a single gpu. - - This method tests model with a single gpu and displays test progress bar. - - Args: - model (nn.Module): Model to be tested. - data_loader (nn.Dataloader): Pytorch data loader. - - Returns: - list: The prediction results. - """ - model.eval() - results = [] - dataset = data_loader.dataset - prog_bar = mmcv.ProgressBar(len(dataset)) - for data in data_loader: - with torch.no_grad(): - result = model(return_loss=False, **data) - results.extend(result) - - # Assume result has the same length of batch_size - # refer to https://github.com/open-mmlab/mmcv/issues/985 - batch_size = len(result) - for _ in range(batch_size): - prog_bar.update() - return results - - -def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): - """Test model with multiple gpus. - - This method tests model with multiple gpus and collects the results - under two different modes: gpu and cpu modes. By setting - ``gpu_collect=True``, it encodes results to gpu tensors and use gpu - communication for results collection. On cpu mode it saves the results on - different gpus to ``tmpdir`` and collects them by the rank 0 worker. - - Args: - model (nn.Module): Model to be tested. - data_loader (nn.Dataloader): Pytorch data loader. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. - gpu_collect (bool): Option to use either gpu or cpu to collect results. - - Returns: - list: The prediction results. - """ - model.eval() - results = [] - dataset = data_loader.dataset - rank, world_size = get_dist_info() - if rank == 0: - prog_bar = mmcv.ProgressBar(len(dataset)) - time.sleep(2) # This line can prevent deadlock problem in some cases. - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, **data) - results.extend(result) - - if rank == 0: - batch_size = len(result) - batch_size_all = batch_size * world_size - if batch_size_all + prog_bar.completed > len(dataset): - batch_size_all = len(dataset) - prog_bar.completed - for _ in range(batch_size_all): - prog_bar.update() - - # collect results from all ranks - if gpu_collect: - results = collect_results_gpu(results, len(dataset)) - else: - results = collect_results_cpu(results, len(dataset), tmpdir) - return results - - -def collect_results_cpu(result_part, size, tmpdir=None): - """Collect results under cpu mode. - - On cpu mode, this function will save the results on different gpus to - ``tmpdir`` and collect them by the rank 0 worker. - - Args: - result_part (list): Result list containing result parts - to be collected. 
- size (int): Size of the results, commonly equal to length of - the results. - tmpdir (str | None): temporal directory for collected results to - store. If set to None, it will create a random temporal directory - for it. - - Returns: - list: The collected results. - """ - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - mmcv.mkdir_or_exist('.dist_test') - tmpdir = tempfile.mkdtemp(dir='.dist_test') - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) - dist.barrier() - # collect all parts - if rank != 0: - return None - else: - # load results of all parts from tmp dir - part_list = [] - for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') - part_result = mmcv.load(part_file) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. - if part_result: - part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - # remove tmp dir - shutil.rmtree(tmpdir) - return ordered_results - - -def collect_results_gpu(result_part, size): - """Collect results under gpu mode. - - On gpu mode, this function will encode results to gpu tensors and use gpu - communication for results collection. - - Args: - result_part (list): Result list containing result parts - to be collected. - size (int): Size of the results, commonly equal to length of - the results. - - Returns: - list: The collected results. - """ - rank, world_size = get_dist_info() - # dump result part to tensor with pickle - part_tensor = torch.tensor( - bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') - # gather all result part tensor shape - shape_tensor = torch.tensor(part_tensor.shape, device='cuda') - shape_list = [shape_tensor.clone() for _ in range(world_size)] - dist.all_gather(shape_list, shape_tensor) - # padding result part tensor to max length - shape_max = torch.tensor(shape_list).max() - part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') - part_send[:shape_tensor[0]] = part_tensor - part_recv_list = [ - part_tensor.new_zeros(shape_max) for _ in range(world_size) - ] - # gather all result part - dist.all_gather(part_recv_list, part_send) - - if rank == 0: - part_list = [] - for recv, shape in zip(part_recv_list, shape_list): - part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. 
- if part_result: - part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - return ordered_results diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/__init__.py b/ControlNet/annotator/uniformer/mmcv/fileio/__init__.py deleted file mode 100644 index 2051b85f7e59bff7bdbaa131849ce8cd31f059a4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .file_client import BaseStorageBackend, FileClient -from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler -from .io import dump, load, register_handler -from .parse import dict_from_file, list_from_file - -__all__ = [ - 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', - 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', - 'list_from_file', 'dict_from_file' -] diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/file_client.py b/ControlNet/annotator/uniformer/mmcv/fileio/file_client.py deleted file mode 100644 index 950f0c1aeab14b8e308a7455ccd64a95b5d98add..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/file_client.py +++ /dev/null @@ -1,1148 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import inspect -import os -import os.path as osp -import re -import tempfile -import warnings -from abc import ABCMeta, abstractmethod -from contextlib import contextmanager -from pathlib import Path -from typing import Iterable, Iterator, Optional, Tuple, Union -from urllib.request import urlopen - -import annotator.uniformer.mmcv as mmcv -from annotator.uniformer.mmcv.utils.misc import has_method -from annotator.uniformer.mmcv.utils.path import is_filepath - - -class BaseStorageBackend(metaclass=ABCMeta): - """Abstract class of storage backends. - - All backends need to implement two apis: ``get()`` and ``get_text()``. - ``get()`` reads the file as a byte stream and ``get_text()`` reads the file - as texts. - """ - - # a flag to indicate whether the backend can create a symlink for a file - _allow_symlink = False - - @property - def name(self): - return self.__class__.__name__ - - @property - def allow_symlink(self): - return self._allow_symlink - - @abstractmethod - def get(self, filepath): - pass - - @abstractmethod - def get_text(self, filepath): - pass - - -class CephBackend(BaseStorageBackend): - """Ceph storage backend (for internal use). - - Args: - path_mapping (dict|None): path mapping dict from local path to Petrel - path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` - will be replaced by ``dst``. Default: None. - - .. warning:: - :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, - please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. 
- """ - - def __init__(self, path_mapping=None): - try: - import ceph - except ImportError: - raise ImportError('Please install ceph to enable CephBackend.') - - warnings.warn( - 'CephBackend will be deprecated, please use PetrelBackend instead') - self._client = ceph.S3Client() - assert isinstance(path_mapping, dict) or path_mapping is None - self.path_mapping = path_mapping - - def get(self, filepath): - filepath = str(filepath) - if self.path_mapping is not None: - for k, v in self.path_mapping.items(): - filepath = filepath.replace(k, v) - value = self._client.Get(filepath) - value_buf = memoryview(value) - return value_buf - - def get_text(self, filepath, encoding=None): - raise NotImplementedError - - -class PetrelBackend(BaseStorageBackend): - """Petrel storage backend (for internal use). - - PetrelBackend supports reading and writing data to multiple clusters. - If the file path contains the cluster name, PetrelBackend will read data - from specified cluster or write data to it. Otherwise, PetrelBackend will - access the default cluster. - - Args: - path_mapping (dict, optional): Path mapping dict from local path to - Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in - ``filepath`` will be replaced by ``dst``. Default: None. - enable_mc (bool, optional): Whether to enable memcached support. - Default: True. - - Examples: - >>> filepath1 = 's3://path/of/file' - >>> filepath2 = 'cluster-name:s3://path/of/file' - >>> client = PetrelBackend() - >>> client.get(filepath1) # get data from default cluster - >>> client.get(filepath2) # get data from 'cluster-name' cluster - """ - - def __init__(self, - path_mapping: Optional[dict] = None, - enable_mc: bool = True): - try: - from petrel_client import client - except ImportError: - raise ImportError('Please install petrel_client to enable ' - 'PetrelBackend.') - - self._client = client.Client(enable_mc=enable_mc) - assert isinstance(path_mapping, dict) or path_mapping is None - self.path_mapping = path_mapping - - def _map_path(self, filepath: Union[str, Path]) -> str: - """Map ``filepath`` to a string path whose prefix will be replaced by - :attr:`self.path_mapping`. - - Args: - filepath (str): Path to be mapped. - """ - filepath = str(filepath) - if self.path_mapping is not None: - for k, v in self.path_mapping.items(): - filepath = filepath.replace(k, v) - return filepath - - def _format_path(self, filepath: str) -> str: - """Convert a ``filepath`` to standard format of petrel oss. - - If the ``filepath`` is concatenated by ``os.path.join``, in a Windows - environment, the ``filepath`` will be the format of - 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the - above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. - - Args: - filepath (str): Path to be formatted. - """ - return re.sub(r'\\+', '/', filepath) - - def get(self, filepath: Union[str, Path]) -> memoryview: - """Read data from a given ``filepath`` with 'rb' mode. - - Args: - filepath (str or Path): Path to read data. - - Returns: - memoryview: A memory view of expected bytes object to avoid - copying. The memoryview object can be converted to bytes by - ``value_buf.tobytes()``. - """ - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - value = self._client.Get(filepath) - value_buf = memoryview(value) - return value_buf - - def get_text(self, - filepath: Union[str, Path], - encoding: str = 'utf-8') -> str: - """Read data from a given ``filepath`` with 'r' mode. 
- - Args: - filepath (str or Path): Path to read data. - encoding (str): The encoding format used to open the ``filepath``. - Default: 'utf-8'. - - Returns: - str: Expected text reading from ``filepath``. - """ - return str(self.get(filepath), encoding=encoding) - - def put(self, obj: bytes, filepath: Union[str, Path]) -> None: - """Save data to a given ``filepath``. - - Args: - obj (bytes): Data to be saved. - filepath (str or Path): Path to write data. - """ - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - self._client.put(filepath, obj) - - def put_text(self, - obj: str, - filepath: Union[str, Path], - encoding: str = 'utf-8') -> None: - """Save data to a given ``filepath``. - - Args: - obj (str): Data to be written. - filepath (str or Path): Path to write data. - encoding (str): The encoding format used to encode the ``obj``. - Default: 'utf-8'. - """ - self.put(bytes(obj, encoding=encoding), filepath) - - def remove(self, filepath: Union[str, Path]) -> None: - """Remove a file. - - Args: - filepath (str or Path): Path to be removed. - """ - if not has_method(self._client, 'delete'): - raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `delete` method, please use a higher version or dev' - ' branch instead.')) - - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - self._client.delete(filepath) - - def exists(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path exists. - - Args: - filepath (str or Path): Path to be checked whether exists. - - Returns: - bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. - """ - if not (has_method(self._client, 'contains') - and has_method(self._client, 'isdir')): - raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` and `isdir` methods, please use a higher' - 'version or dev branch instead.')) - - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - return self._client.contains(filepath) or self._client.isdir(filepath) - - def isdir(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a directory. - - Args: - filepath (str or Path): Path to be checked whether it is a - directory. - - Returns: - bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. - """ - if not has_method(self._client, 'isdir'): - raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `isdir` method, please use a higher version or dev' - ' branch instead.')) - - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - return self._client.isdir(filepath) - - def isfile(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a file. - - Args: - filepath (str or Path): Path to be checked whether it is a file. - - Returns: - bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. - """ - if not has_method(self._client, 'contains'): - raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` method, please use a higher version or ' - 'dev branch instead.')) - - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - return self._client.contains(filepath) - - def join_path(self, filepath: Union[str, Path], - *filepaths: Union[str, Path]) -> str: - """Concatenate all file paths. - - Args: - filepath (str or Path): Path to be concatenated. 
- - Returns: - str: The result after concatenation. - """ - filepath = self._format_path(self._map_path(filepath)) - if filepath.endswith('/'): - filepath = filepath[:-1] - formatted_paths = [filepath] - for path in filepaths: - formatted_paths.append(self._format_path(self._map_path(path))) - return '/'.join(formatted_paths) - - @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: - """Download a file from ``filepath`` and return a temporary path. - - ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It - can be called with ``with`` statement, and when exists from the - ``with`` statement, the temporary path will be released. - - Args: - filepath (str | Path): Download a file from ``filepath``. - - Examples: - >>> client = PetrelBackend() - >>> # After existing from the ``with`` clause, - >>> # the path will be removed - >>> with client.get_local_path('s3://path/of/your/file') as path: - ... # do something here - - Yields: - Iterable[str]: Only yield one temporary path. - """ - filepath = self._map_path(filepath) - filepath = self._format_path(filepath) - assert self.isfile(filepath) - try: - f = tempfile.NamedTemporaryFile(delete=False) - f.write(self.get(filepath)) - f.close() - yield f.name - finally: - os.remove(f.name) - - def list_dir_or_file(self, - dir_path: Union[str, Path], - list_dir: bool = True, - list_file: bool = True, - suffix: Optional[Union[str, Tuple[str]]] = None, - recursive: bool = False) -> Iterator[str]: - """Scan a directory to find the interested directories or files in - arbitrary order. - - Note: - Petrel has no concept of directories but it simulates the directory - hierarchy in the filesystem through public prefixes. In addition, - if the returned path ends with '/', it means the path is a public - prefix which is a logical directory. - - Note: - :meth:`list_dir_or_file` returns the path relative to ``dir_path``. - In addition, the returned path of directory will not contains the - suffix '/' which is consistent with other backends. - - Args: - dir_path (str | Path): Path of the directory. - list_dir (bool): List the directories. Default: True. - list_file (bool): List the path of files. Default: True. - suffix (str or tuple[str], optional): File suffix - that we are interested in. Default: None. - recursive (bool): If set to True, recursively scan the - directory. Default: False. - - Yields: - Iterable[str]: A relative path to ``dir_path``. 
- """ - if not has_method(self._client, 'list'): - raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `list` method, please use a higher version or dev' - ' branch instead.')) - - dir_path = self._map_path(dir_path) - dir_path = self._format_path(dir_path) - if list_dir and suffix is not None: - raise TypeError( - '`list_dir` should be False when `suffix` is not None') - - if (suffix is not None) and not isinstance(suffix, (str, tuple)): - raise TypeError('`suffix` must be a string or tuple of strings') - - # Petrel's simulated directory hierarchy assumes that directory paths - # should end with `/` - if not dir_path.endswith('/'): - dir_path += '/' - - root = dir_path - - def _list_dir_or_file(dir_path, list_dir, list_file, suffix, - recursive): - for path in self._client.list(dir_path): - # the `self.isdir` is not used here to determine whether path - # is a directory, because `self.isdir` relies on - # `self._client.list` - if path.endswith('/'): # a directory path - next_dir_path = self.join_path(dir_path, path) - if list_dir: - # get the relative path and exclude the last - # character '/' - rel_dir = next_dir_path[len(root):-1] - yield rel_dir - if recursive: - yield from _list_dir_or_file(next_dir_path, list_dir, - list_file, suffix, - recursive) - else: # a file path - absolute_path = self.join_path(dir_path, path) - rel_path = absolute_path[len(root):] - if (suffix is None - or rel_path.endswith(suffix)) and list_file: - yield rel_path - - return _list_dir_or_file(dir_path, list_dir, list_file, suffix, - recursive) - - -class MemcachedBackend(BaseStorageBackend): - """Memcached storage backend. - - Attributes: - server_list_cfg (str): Config file for memcached server list. - client_cfg (str): Config file for memcached client. - sys_path (str | None): Additional path to be appended to `sys.path`. - Default: None. - """ - - def __init__(self, server_list_cfg, client_cfg, sys_path=None): - if sys_path is not None: - import sys - sys.path.append(sys_path) - try: - import mc - except ImportError: - raise ImportError( - 'Please install memcached to enable MemcachedBackend.') - - self.server_list_cfg = server_list_cfg - self.client_cfg = client_cfg - self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, - self.client_cfg) - # mc.pyvector servers as a point which points to a memory cache - self._mc_buffer = mc.pyvector() - - def get(self, filepath): - filepath = str(filepath) - import mc - self._client.Get(filepath, self._mc_buffer) - value_buf = mc.ConvertBuffer(self._mc_buffer) - return value_buf - - def get_text(self, filepath, encoding=None): - raise NotImplementedError - - -class LmdbBackend(BaseStorageBackend): - """Lmdb storage backend. - - Args: - db_path (str): Lmdb database path. - readonly (bool, optional): Lmdb environment parameter. If True, - disallow any write operations. Default: True. - lock (bool, optional): Lmdb environment parameter. If False, when - concurrent access occurs, do not lock the database. Default: False. - readahead (bool, optional): Lmdb environment parameter. If False, - disable the OS filesystem readahead mechanism, which may improve - random read performance when a database is larger than RAM. - Default: False. - - Attributes: - db_path (str): Lmdb database path. 
- """ - - def __init__(self, - db_path, - readonly=True, - lock=False, - readahead=False, - **kwargs): - try: - import lmdb - except ImportError: - raise ImportError('Please install lmdb to enable LmdbBackend.') - - self.db_path = str(db_path) - self._client = lmdb.open( - self.db_path, - readonly=readonly, - lock=lock, - readahead=readahead, - **kwargs) - - def get(self, filepath): - """Get values according to the filepath. - - Args: - filepath (str | obj:`Path`): Here, filepath is the lmdb key. - """ - filepath = str(filepath) - with self._client.begin(write=False) as txn: - value_buf = txn.get(filepath.encode('ascii')) - return value_buf - - def get_text(self, filepath, encoding=None): - raise NotImplementedError - - -class HardDiskBackend(BaseStorageBackend): - """Raw hard disks storage backend.""" - - _allow_symlink = True - - def get(self, filepath: Union[str, Path]) -> bytes: - """Read data from a given ``filepath`` with 'rb' mode. - - Args: - filepath (str or Path): Path to read data. - - Returns: - bytes: Expected bytes object. - """ - with open(filepath, 'rb') as f: - value_buf = f.read() - return value_buf - - def get_text(self, - filepath: Union[str, Path], - encoding: str = 'utf-8') -> str: - """Read data from a given ``filepath`` with 'r' mode. - - Args: - filepath (str or Path): Path to read data. - encoding (str): The encoding format used to open the ``filepath``. - Default: 'utf-8'. - - Returns: - str: Expected text reading from ``filepath``. - """ - with open(filepath, 'r', encoding=encoding) as f: - value_buf = f.read() - return value_buf - - def put(self, obj: bytes, filepath: Union[str, Path]) -> None: - """Write data to a given ``filepath`` with 'wb' mode. - - Note: - ``put`` will create a directory if the directory of ``filepath`` - does not exist. - - Args: - obj (bytes): Data to be written. - filepath (str or Path): Path to write data. - """ - mmcv.mkdir_or_exist(osp.dirname(filepath)) - with open(filepath, 'wb') as f: - f.write(obj) - - def put_text(self, - obj: str, - filepath: Union[str, Path], - encoding: str = 'utf-8') -> None: - """Write data to a given ``filepath`` with 'w' mode. - - Note: - ``put_text`` will create a directory if the directory of - ``filepath`` does not exist. - - Args: - obj (str): Data to be written. - filepath (str or Path): Path to write data. - encoding (str): The encoding format used to open the ``filepath``. - Default: 'utf-8'. - """ - mmcv.mkdir_or_exist(osp.dirname(filepath)) - with open(filepath, 'w', encoding=encoding) as f: - f.write(obj) - - def remove(self, filepath: Union[str, Path]) -> None: - """Remove a file. - - Args: - filepath (str or Path): Path to be removed. - """ - os.remove(filepath) - - def exists(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path exists. - - Args: - filepath (str or Path): Path to be checked whether exists. - - Returns: - bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. - """ - return osp.exists(filepath) - - def isdir(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a directory. - - Args: - filepath (str or Path): Path to be checked whether it is a - directory. - - Returns: - bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. - """ - return osp.isdir(filepath) - - def isfile(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a file. - - Args: - filepath (str or Path): Path to be checked whether it is a file. 
- - Returns: - bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. - """ - return osp.isfile(filepath) - - def join_path(self, filepath: Union[str, Path], - *filepaths: Union[str, Path]) -> str: - """Concatenate all file paths. - - Join one or more filepath components intelligently. The return value - is the concatenation of filepath and any members of *filepaths. - - Args: - filepath (str or Path): Path to be concatenated. - - Returns: - str: The result of concatenation. - """ - return osp.join(filepath, *filepaths) - - @contextmanager - def get_local_path( - self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: - """Only for unified API and do nothing.""" - yield filepath - - def list_dir_or_file(self, - dir_path: Union[str, Path], - list_dir: bool = True, - list_file: bool = True, - suffix: Optional[Union[str, Tuple[str]]] = None, - recursive: bool = False) -> Iterator[str]: - """Scan a directory to find the interested directories or files in - arbitrary order. - - Note: - :meth:`list_dir_or_file` returns the path relative to ``dir_path``. - - Args: - dir_path (str | Path): Path of the directory. - list_dir (bool): List the directories. Default: True. - list_file (bool): List the path of files. Default: True. - suffix (str or tuple[str], optional): File suffix - that we are interested in. Default: None. - recursive (bool): If set to True, recursively scan the - directory. Default: False. - - Yields: - Iterable[str]: A relative path to ``dir_path``. - """ - if list_dir and suffix is not None: - raise TypeError('`suffix` should be None when `list_dir` is True') - - if (suffix is not None) and not isinstance(suffix, (str, tuple)): - raise TypeError('`suffix` must be a string or tuple of strings') - - root = dir_path - - def _list_dir_or_file(dir_path, list_dir, list_file, suffix, - recursive): - for entry in os.scandir(dir_path): - if not entry.name.startswith('.') and entry.is_file(): - rel_path = osp.relpath(entry.path, root) - if (suffix is None - or rel_path.endswith(suffix)) and list_file: - yield rel_path - elif osp.isdir(entry.path): - if list_dir: - rel_dir = osp.relpath(entry.path, root) - yield rel_dir - if recursive: - yield from _list_dir_or_file(entry.path, list_dir, - list_file, suffix, - recursive) - - return _list_dir_or_file(dir_path, list_dir, list_file, suffix, - recursive) - - -class HTTPBackend(BaseStorageBackend): - """HTTP and HTTPS storage bachend.""" - - def get(self, filepath): - value_buf = urlopen(filepath).read() - return value_buf - - def get_text(self, filepath, encoding='utf-8'): - value_buf = urlopen(filepath).read() - return value_buf.decode(encoding) - - @contextmanager - def get_local_path(self, filepath: str) -> Iterable[str]: - """Download a file from ``filepath``. - - ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It - can be called with ``with`` statement, and when exists from the - ``with`` statement, the temporary path will be released. - - Args: - filepath (str): Download a file from ``filepath``. - - Examples: - >>> client = HTTPBackend() - >>> # After existing from the ``with`` clause, - >>> # the path will be removed - >>> with client.get_local_path('http://path/of/your/file') as path: - ... # do something here - """ - try: - f = tempfile.NamedTemporaryFile(delete=False) - f.write(self.get(filepath)) - f.close() - yield f.name - finally: - os.remove(f.name) - - -class FileClient: - """A general file client to access files in different backends. 
- - The client loads a file or text in a specified backend from its path - and returns it as a binary or text file. There are two ways to choose a - backend, the name of backend and the prefix of path. Although both of them - can be used to choose a storage backend, ``backend`` has a higher priority - that is if they are all set, the storage backend will be chosen by the - backend argument. If they are all `None`, the disk backend will be chosen. - Note that It can also register other backend accessor with a given name, - prefixes, and backend class. In addition, We use the singleton pattern to - avoid repeated object creation. If the arguments are the same, the same - object will be returned. - - Args: - backend (str, optional): The storage backend type. Options are "disk", - "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. - prefix (str, optional): The prefix of the registered storage backend. - Options are "s3", "http", "https". Default: None. - - Examples: - >>> # only set backend - >>> file_client = FileClient(backend='petrel') - >>> # only set prefix - >>> file_client = FileClient(prefix='s3') - >>> # set both backend and prefix but use backend to choose client - >>> file_client = FileClient(backend='petrel', prefix='s3') - >>> # if the arguments are the same, the same object is returned - >>> file_client1 = FileClient(backend='petrel') - >>> file_client1 is file_client - True - - Attributes: - client (:obj:`BaseStorageBackend`): The backend object. - """ - - _backends = { - 'disk': HardDiskBackend, - 'ceph': CephBackend, - 'memcached': MemcachedBackend, - 'lmdb': LmdbBackend, - 'petrel': PetrelBackend, - 'http': HTTPBackend, - } - # This collection is used to record the overridden backends, and when a - # backend appears in the collection, the singleton pattern is disabled for - # that backend, because if the singleton pattern is used, then the object - # returned will be the backend before overwriting - _overridden_backends = set() - _prefix_to_backends = { - 's3': PetrelBackend, - 'http': HTTPBackend, - 'https': HTTPBackend, - } - _overridden_prefixes = set() - - _instances = {} - - def __new__(cls, backend=None, prefix=None, **kwargs): - if backend is None and prefix is None: - backend = 'disk' - if backend is not None and backend not in cls._backends: - raise ValueError( - f'Backend {backend} is not supported. Currently supported ones' - f' are {list(cls._backends.keys())}') - if prefix is not None and prefix not in cls._prefix_to_backends: - raise ValueError( - f'prefix {prefix} is not supported. 
Currently supported ones ' - f'are {list(cls._prefix_to_backends.keys())}') - - # concatenate the arguments to a unique key for determining whether - # objects with the same arguments were created - arg_key = f'{backend}:{prefix}' - for key, value in kwargs.items(): - arg_key += f':{key}:{value}' - - # if a backend was overridden, it will create a new object - if (arg_key in cls._instances - and backend not in cls._overridden_backends - and prefix not in cls._overridden_prefixes): - _instance = cls._instances[arg_key] - else: - # create a new object and put it to _instance - _instance = super().__new__(cls) - if backend is not None: - _instance.client = cls._backends[backend](**kwargs) - else: - _instance.client = cls._prefix_to_backends[prefix](**kwargs) - - cls._instances[arg_key] = _instance - - return _instance - - @property - def name(self): - return self.client.name - - @property - def allow_symlink(self): - return self.client.allow_symlink - - @staticmethod - def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: - """Parse the prefix of a uri. - - Args: - uri (str | Path): Uri to be parsed that contains the file prefix. - - Examples: - >>> FileClient.parse_uri_prefix('s3://path/of/your/file') - 's3' - - Returns: - str | None: Return the prefix of uri if the uri contains '://' - else ``None``. - """ - assert is_filepath(uri) - uri = str(uri) - if '://' not in uri: - return None - else: - prefix, _ = uri.split('://') - # In the case of PetrelBackend, the prefix may contains the cluster - # name like clusterName:s3 - if ':' in prefix: - _, prefix = prefix.split(':') - return prefix - - @classmethod - def infer_client(cls, - file_client_args: Optional[dict] = None, - uri: Optional[Union[str, Path]] = None) -> 'FileClient': - """Infer a suitable file client based on the URI and arguments. - - Args: - file_client_args (dict, optional): Arguments to instantiate a - FileClient. Default: None. - uri (str | Path, optional): Uri to be parsed that contains the file - prefix. Default: None. - - Examples: - >>> uri = 's3://path/of/your/file' - >>> file_client = FileClient.infer_client(uri=uri) - >>> file_client_args = {'backend': 'petrel'} - >>> file_client = FileClient.infer_client(file_client_args) - - Returns: - FileClient: Instantiated FileClient object. 
- """ - assert file_client_args is not None or uri is not None - if file_client_args is None: - file_prefix = cls.parse_uri_prefix(uri) # type: ignore - return cls(prefix=file_prefix) - else: - return cls(**file_client_args) - - @classmethod - def _register_backend(cls, name, backend, force=False, prefixes=None): - if not isinstance(name, str): - raise TypeError('the backend name should be a string, ' - f'but got {type(name)}') - if not inspect.isclass(backend): - raise TypeError( - f'backend should be a class but got {type(backend)}') - if not issubclass(backend, BaseStorageBackend): - raise TypeError( - f'backend {backend} is not a subclass of BaseStorageBackend') - if not force and name in cls._backends: - raise KeyError( - f'{name} is already registered as a storage backend, ' - 'add "force=True" if you want to override it') - - if name in cls._backends and force: - cls._overridden_backends.add(name) - cls._backends[name] = backend - - if prefixes is not None: - if isinstance(prefixes, str): - prefixes = [prefixes] - else: - assert isinstance(prefixes, (list, tuple)) - for prefix in prefixes: - if prefix not in cls._prefix_to_backends: - cls._prefix_to_backends[prefix] = backend - elif (prefix in cls._prefix_to_backends) and force: - cls._overridden_prefixes.add(prefix) - cls._prefix_to_backends[prefix] = backend - else: - raise KeyError( - f'{prefix} is already registered as a storage backend,' - ' add "force=True" if you want to override it') - - @classmethod - def register_backend(cls, name, backend=None, force=False, prefixes=None): - """Register a backend to FileClient. - - This method can be used as a normal class method or a decorator. - - .. code-block:: python - - class NewBackend(BaseStorageBackend): - - def get(self, filepath): - return filepath - - def get_text(self, filepath): - return filepath - - FileClient.register_backend('new', NewBackend) - - or - - .. code-block:: python - - @FileClient.register_backend('new') - class NewBackend(BaseStorageBackend): - - def get(self, filepath): - return filepath - - def get_text(self, filepath): - return filepath - - Args: - name (str): The name of the registered backend. - backend (class, optional): The backend class to be registered, - which must be a subclass of :class:`BaseStorageBackend`. - When this method is used as a decorator, backend is None. - Defaults to None. - force (bool, optional): Whether to override the backend if the name - has already been registered. Defaults to False. - prefixes (str or list[str] or tuple[str], optional): The prefixes - of the registered storage backend. Default: None. - `New in version 1.3.15.` - """ - if backend is not None: - cls._register_backend( - name, backend, force=force, prefixes=prefixes) - return - - def _register(backend_cls): - cls._register_backend( - name, backend_cls, force=force, prefixes=prefixes) - return backend_cls - - return _register - - def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: - """Read data from a given ``filepath`` with 'rb' mode. - - Note: - There are two types of return values for ``get``, one is ``bytes`` - and the other is ``memoryview``. The advantage of using memoryview - is that you can avoid copying, and if you want to convert it to - ``bytes``, you can use ``.tobytes()``. - - Args: - filepath (str or Path): Path to read data. - - Returns: - bytes | memoryview: Expected bytes object or a memory view of the - bytes object. 
- """ - return self.client.get(filepath) - - def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: - """Read data from a given ``filepath`` with 'r' mode. - - Args: - filepath (str or Path): Path to read data. - encoding (str): The encoding format used to open the ``filepath``. - Default: 'utf-8'. - - Returns: - str: Expected text reading from ``filepath``. - """ - return self.client.get_text(filepath, encoding) - - def put(self, obj: bytes, filepath: Union[str, Path]) -> None: - """Write data to a given ``filepath`` with 'wb' mode. - - Note: - ``put`` should create a directory if the directory of ``filepath`` - does not exist. - - Args: - obj (bytes): Data to be written. - filepath (str or Path): Path to write data. - """ - self.client.put(obj, filepath) - - def put_text(self, obj: str, filepath: Union[str, Path]) -> None: - """Write data to a given ``filepath`` with 'w' mode. - - Note: - ``put_text`` should create a directory if the directory of - ``filepath`` does not exist. - - Args: - obj (str): Data to be written. - filepath (str or Path): Path to write data. - encoding (str, optional): The encoding format used to open the - `filepath`. Default: 'utf-8'. - """ - self.client.put_text(obj, filepath) - - def remove(self, filepath: Union[str, Path]) -> None: - """Remove a file. - - Args: - filepath (str, Path): Path to be removed. - """ - self.client.remove(filepath) - - def exists(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path exists. - - Args: - filepath (str or Path): Path to be checked whether exists. - - Returns: - bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. - """ - return self.client.exists(filepath) - - def isdir(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a directory. - - Args: - filepath (str or Path): Path to be checked whether it is a - directory. - - Returns: - bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. - """ - return self.client.isdir(filepath) - - def isfile(self, filepath: Union[str, Path]) -> bool: - """Check whether a file path is a file. - - Args: - filepath (str or Path): Path to be checked whether it is a file. - - Returns: - bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. - """ - return self.client.isfile(filepath) - - def join_path(self, filepath: Union[str, Path], - *filepaths: Union[str, Path]) -> str: - """Concatenate all file paths. - - Join one or more filepath components intelligently. The return value - is the concatenation of filepath and any members of *filepaths. - - Args: - filepath (str or Path): Path to be concatenated. - - Returns: - str: The result of concatenation. - """ - return self.client.join_path(filepath, *filepaths) - - @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: - """Download data from ``filepath`` and write the data to local path. - - ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It - can be called with ``with`` statement, and when exists from the - ``with`` statement, the temporary path will be released. - - Note: - If the ``filepath`` is a local path, just return itself. - - .. warning:: - ``get_local_path`` is an experimental interface that may change in - the future. - - Args: - filepath (str or Path): Path to be read data. - - Examples: - >>> file_client = FileClient(prefix='s3') - >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: - ... 
# do something here - - Yields: - Iterable[str]: Only yield one path. - """ - with self.client.get_local_path(str(filepath)) as local_path: - yield local_path - - def list_dir_or_file(self, - dir_path: Union[str, Path], - list_dir: bool = True, - list_file: bool = True, - suffix: Optional[Union[str, Tuple[str]]] = None, - recursive: bool = False) -> Iterator[str]: - """Scan a directory to find the interested directories or files in - arbitrary order. - - Note: - :meth:`list_dir_or_file` returns the path relative to ``dir_path``. - - Args: - dir_path (str | Path): Path of the directory. - list_dir (bool): List the directories. Default: True. - list_file (bool): List the path of files. Default: True. - suffix (str or tuple[str], optional): File suffix - that we are interested in. Default: None. - recursive (bool): If set to True, recursively scan the - directory. Default: False. - - Yields: - Iterable[str]: A relative path to ``dir_path``. - """ - yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, - suffix, recursive) diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/__init__.py b/ControlNet/annotator/uniformer/mmcv/fileio/handlers/__init__.py deleted file mode 100644 index aa24d91972837b8756b225f4879bac20436eb72a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import BaseFileHandler -from .json_handler import JsonHandler -from .pickle_handler import PickleHandler -from .yaml_handler import YamlHandler - -__all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/base.py b/ControlNet/annotator/uniformer/mmcv/fileio/handlers/base.py deleted file mode 100644 index 288878bc57282fbb2f12b32290152ca8e9d3cab0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/base.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta, abstractmethod - - -class BaseFileHandler(metaclass=ABCMeta): - # `str_like` is a flag to indicate whether the type of file object is - # str-like object or bytes-like object. Pickle only processes bytes-like - # objects but json only processes str-like object. If it is str-like - # object, `StringIO` will be used to process the buffer. - str_like = True - - @abstractmethod - def load_from_fileobj(self, file, **kwargs): - pass - - @abstractmethod - def dump_to_fileobj(self, obj, file, **kwargs): - pass - - @abstractmethod - def dump_to_str(self, obj, **kwargs): - pass - - def load_from_path(self, filepath, mode='r', **kwargs): - with open(filepath, mode) as f: - return self.load_from_fileobj(f, **kwargs) - - def dump_to_path(self, obj, filepath, mode='w', **kwargs): - with open(filepath, mode) as f: - self.dump_to_fileobj(obj, f, **kwargs) diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/json_handler.py b/ControlNet/annotator/uniformer/mmcv/fileio/handlers/json_handler.py deleted file mode 100644 index 18d4f15f74139d20adff18b20be5529c592a66b6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/json_handler.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json - -import numpy as np - -from .base import BaseFileHandler - - -def set_default(obj): - """Set default json values for non-serializable values. 
- - It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. - It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, - etc.) into plain numbers of plain python built-in types. - """ - if isinstance(obj, (set, range)): - return list(obj) - elif isinstance(obj, np.ndarray): - return obj.tolist() - elif isinstance(obj, np.generic): - return obj.item() - raise TypeError(f'{type(obj)} is unsupported for json dump') - - -class JsonHandler(BaseFileHandler): - - def load_from_fileobj(self, file): - return json.load(file) - - def dump_to_fileobj(self, obj, file, **kwargs): - kwargs.setdefault('default', set_default) - json.dump(obj, file, **kwargs) - - def dump_to_str(self, obj, **kwargs): - kwargs.setdefault('default', set_default) - return json.dumps(obj, **kwargs) diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py b/ControlNet/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py deleted file mode 100644 index b37c79bed4ef9fd8913715e62dbe3fc5cafdc3aa..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import pickle - -from .base import BaseFileHandler - - -class PickleHandler(BaseFileHandler): - - str_like = False - - def load_from_fileobj(self, file, **kwargs): - return pickle.load(file, **kwargs) - - def load_from_path(self, filepath, **kwargs): - return super(PickleHandler, self).load_from_path( - filepath, mode='rb', **kwargs) - - def dump_to_str(self, obj, **kwargs): - kwargs.setdefault('protocol', 2) - return pickle.dumps(obj, **kwargs) - - def dump_to_fileobj(self, obj, file, **kwargs): - kwargs.setdefault('protocol', 2) - pickle.dump(obj, file, **kwargs) - - def dump_to_path(self, obj, filepath, **kwargs): - super(PickleHandler, self).dump_to_path( - obj, filepath, mode='wb', **kwargs) diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py b/ControlNet/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py deleted file mode 100644 index c5aa2eea1e8c76f8baf753d1c8c959dee665e543..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import yaml - -try: - from yaml import CLoader as Loader, CDumper as Dumper -except ImportError: - from yaml import Loader, Dumper - -from .base import BaseFileHandler # isort:skip - - -class YamlHandler(BaseFileHandler): - - def load_from_fileobj(self, file, **kwargs): - kwargs.setdefault('Loader', Loader) - return yaml.load(file, **kwargs) - - def dump_to_fileobj(self, obj, file, **kwargs): - kwargs.setdefault('Dumper', Dumper) - yaml.dump(obj, file, **kwargs) - - def dump_to_str(self, obj, **kwargs): - kwargs.setdefault('Dumper', Dumper) - return yaml.dump(obj, **kwargs) diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/io.py b/ControlNet/annotator/uniformer/mmcv/fileio/io.py deleted file mode 100644 index aaefde58aa3ea5b58f86249ce7e1c40c186eb8dd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/io.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
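The handlers deleted above all implement the same three-method `BaseFileHandler` contract, and the `io.py` whose removal starts here provides a `register_handler` decorator for hooking new formats into its `file_handlers` table. A sketch of a hypothetical plain-text handler under those assumptions; the `txt` format and the `TxtHandler` name are illustrative, not part of the original code.

from annotator.uniformer.mmcv.fileio import BaseFileHandler, register_handler

@register_handler('txt')                  # registers a TxtHandler() instance for 'txt'
class TxtHandler(BaseFileHandler):
    """Hypothetical handler that treats the whole file as one string."""

    def load_from_fileobj(self, file, **kwargs):
        return file.read()

    def dump_to_fileobj(self, obj, file, **kwargs):
        file.write(str(obj))

    def dump_to_str(self, obj, **kwargs):
        return str(obj)

With that in place, `load('notes.txt')` and `dump('hello', 'notes.txt')` from the same fileio package would route through `TxtHandler`.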
-from io import BytesIO, StringIO -from pathlib import Path - -from ..utils import is_list_of, is_str -from .file_client import FileClient -from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler - -file_handlers = { - 'json': JsonHandler(), - 'yaml': YamlHandler(), - 'yml': YamlHandler(), - 'pickle': PickleHandler(), - 'pkl': PickleHandler() -} - - -def load(file, file_format=None, file_client_args=None, **kwargs): - """Load data from json/yaml/pickle files. - - This method provides a unified api for loading data from serialized files. - - Note: - In v1.3.16 and later, ``load`` supports loading data from serialized - files those can be storaged in different backends. - - Args: - file (str or :obj:`Path` or file-like object): Filename or a file-like - object. - file_format (str, optional): If not specified, the file format will be - inferred from the file extension, otherwise use the specified one. - Currently supported formats include "json", "yaml/yml" and - "pickle/pkl". - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - - Examples: - >>> load('/path/of/your/file') # file is storaged in disk - >>> load('https://path/of/your/file') # file is storaged in Internet - >>> load('s3://path/of/your/file') # file is storaged in petrel - - Returns: - The content from the file. - """ - if isinstance(file, Path): - file = str(file) - if file_format is None and is_str(file): - file_format = file.split('.')[-1] - if file_format not in file_handlers: - raise TypeError(f'Unsupported format: {file_format}') - - handler = file_handlers[file_format] - if is_str(file): - file_client = FileClient.infer_client(file_client_args, file) - if handler.str_like: - with StringIO(file_client.get_text(file)) as f: - obj = handler.load_from_fileobj(f, **kwargs) - else: - with BytesIO(file_client.get(file)) as f: - obj = handler.load_from_fileobj(f, **kwargs) - elif hasattr(file, 'read'): - obj = handler.load_from_fileobj(file, **kwargs) - else: - raise TypeError('"file" must be a filepath str or a file-object') - return obj - - -def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): - """Dump data to json/yaml/pickle strings or files. - - This method provides a unified api for dumping data as strings or to files, - and also supports custom arguments for each file format. - - Note: - In v1.3.16 and later, ``dump`` supports dumping data as strings or to - files which is saved to different backends. - - Args: - obj (any): The python object to be dumped. - file (str or :obj:`Path` or file-like object, optional): If not - specified, then the object is dumped to a str, otherwise to a file - specified by the filename or file-like object. - file_format (str, optional): Same as :func:`load`. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - - Examples: - >>> dump('hello world', '/path/of/your/file') # disk - >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel - - Returns: - bool: True for success, False otherwise. 
- """ - if isinstance(file, Path): - file = str(file) - if file_format is None: - if is_str(file): - file_format = file.split('.')[-1] - elif file is None: - raise ValueError( - 'file_format must be specified since file is None') - if file_format not in file_handlers: - raise TypeError(f'Unsupported format: {file_format}') - - handler = file_handlers[file_format] - if file is None: - return handler.dump_to_str(obj, **kwargs) - elif is_str(file): - file_client = FileClient.infer_client(file_client_args, file) - if handler.str_like: - with StringIO() as f: - handler.dump_to_fileobj(obj, f, **kwargs) - file_client.put_text(f.getvalue(), file) - else: - with BytesIO() as f: - handler.dump_to_fileobj(obj, f, **kwargs) - file_client.put(f.getvalue(), file) - elif hasattr(file, 'write'): - handler.dump_to_fileobj(obj, file, **kwargs) - else: - raise TypeError('"file" must be a filename str or a file-object') - - -def _register_handler(handler, file_formats): - """Register a handler for some file extensions. - - Args: - handler (:obj:`BaseFileHandler`): Handler to be registered. - file_formats (str or list[str]): File formats to be handled by this - handler. - """ - if not isinstance(handler, BaseFileHandler): - raise TypeError( - f'handler must be a child of BaseFileHandler, not {type(handler)}') - if isinstance(file_formats, str): - file_formats = [file_formats] - if not is_list_of(file_formats, str): - raise TypeError('file_formats must be a str or a list of str') - for ext in file_formats: - file_handlers[ext] = handler - - -def register_handler(file_formats, **kwargs): - - def wrap(cls): - _register_handler(cls(**kwargs), file_formats) - return cls - - return wrap diff --git a/ControlNet/annotator/uniformer/mmcv/fileio/parse.py b/ControlNet/annotator/uniformer/mmcv/fileio/parse.py deleted file mode 100644 index f60f0d611b8d75692221d0edd7dc993b0a6445c9..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/fileio/parse.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from io import StringIO - -from .file_client import FileClient - - -def list_from_file(filename, - prefix='', - offset=0, - max_num=0, - encoding='utf-8', - file_client_args=None): - """Load a text file and parse the content as a list of strings. - - Note: - In v1.3.16 and later, ``list_from_file`` supports loading a text file - which can be storaged in different backends and parsing the content as - a list for strings. - - Args: - filename (str): Filename. - prefix (str): The prefix to be inserted to the beginning of each item. - offset (int): The offset of lines. - max_num (int): The maximum number of lines to be read, - zeros and negatives mean no limitation. - encoding (str): Encoding used to open the file. Default utf-8. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - - Examples: - >>> list_from_file('/path/of/your/file') # disk - ['hello', 'world'] - >>> list_from_file('s3://path/of/your/file') # ceph or petrel - ['hello', 'world'] - - Returns: - list[str]: A list of strings. 
- """ - cnt = 0 - item_list = [] - file_client = FileClient.infer_client(file_client_args, filename) - with StringIO(file_client.get_text(filename, encoding)) as f: - for _ in range(offset): - f.readline() - for line in f: - if 0 < max_num <= cnt: - break - item_list.append(prefix + line.rstrip('\n\r')) - cnt += 1 - return item_list - - -def dict_from_file(filename, - key_type=str, - encoding='utf-8', - file_client_args=None): - """Load a text file and parse the content as a dict. - - Each line of the text file will be two or more columns split by - whitespaces or tabs. The first column will be parsed as dict keys, and - the following columns will be parsed as dict values. - - Note: - In v1.3.16 and later, ``dict_from_file`` supports loading a text file - which can be storaged in different backends and parsing the content as - a dict. - - Args: - filename(str): Filename. - key_type(type): Type of the dict keys. str is user by default and - type conversion will be performed if specified. - encoding (str): Encoding used to open the file. Default utf-8. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - - Examples: - >>> dict_from_file('/path/of/your/file') # disk - {'key1': 'value1', 'key2': 'value2'} - >>> dict_from_file('s3://path/of/your/file') # ceph or petrel - {'key1': 'value1', 'key2': 'value2'} - - Returns: - dict: The parsed contents. - """ - mapping = {} - file_client = FileClient.infer_client(file_client_args, filename) - with StringIO(file_client.get_text(filename, encoding)) as f: - for line in f: - items = line.rstrip('\n').split() - assert len(items) >= 2 - key = key_type(items[0]) - val = items[1:] if len(items) > 2 else items[1] - mapping[key] = val - return mapping diff --git a/ControlNet/annotator/uniformer/mmcv/image/__init__.py b/ControlNet/annotator/uniformer/mmcv/image/__init__.py deleted file mode 100644 index d0051d609d3de4e7562e3fe638335c66617c4d91..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, - gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, - rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) -from .geometric import (cutout, imcrop, imflip, imflip_, impad, - impad_to_multiple, imrescale, imresize, imresize_like, - imresize_to_multiple, imrotate, imshear, imtranslate, - rescale_size) -from .io import imfrombytes, imread, imwrite, supported_backends, use_backend -from .misc import tensor2imgs -from .photometric import (adjust_brightness, adjust_color, adjust_contrast, - adjust_lighting, adjust_sharpness, auto_contrast, - clahe, imdenormalize, imequalize, iminvert, - imnormalize, imnormalize_, lut_transform, posterize, - solarize) - -__all__ = [ - 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', - 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', - 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', - 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', - 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', - 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', - 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', - 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', - 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', - 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' -] diff --git a/ControlNet/annotator/uniformer/mmcv/image/colorspace.py b/ControlNet/annotator/uniformer/mmcv/image/colorspace.py deleted file mode 100644 index 814533952fdfda23d67cb6a3073692d8c1156add..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/colorspace.py +++ /dev/null @@ -1,306 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import cv2 -import numpy as np - - -def imconvert(img, src, dst): - """Convert an image from the src colorspace to dst colorspace. - - Args: - img (ndarray): The input image. - src (str): The source colorspace, e.g., 'rgb', 'hsv'. - dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. - - Returns: - ndarray: The converted image. - """ - code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') - out_img = cv2.cvtColor(img, code) - return out_img - - -def bgr2gray(img, keepdim=False): - """Convert a BGR image to grayscale image. - - Args: - img (ndarray): The input image. - keepdim (bool): If False (by default), then return the grayscale image - with 2 dims, otherwise 3 dims. - - Returns: - ndarray: The converted grayscale image. - """ - out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if keepdim: - out_img = out_img[..., None] - return out_img - - -def rgb2gray(img, keepdim=False): - """Convert a RGB image to grayscale image. - - Args: - img (ndarray): The input image. - keepdim (bool): If False (by default), then return the grayscale image - with 2 dims, otherwise 3 dims. - - Returns: - ndarray: The converted grayscale image. - """ - out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) - if keepdim: - out_img = out_img[..., None] - return out_img - - -def gray2bgr(img): - """Convert a grayscale image to BGR image. - - Args: - img (ndarray): The input image. - - Returns: - ndarray: The converted BGR image. - """ - img = img[..., None] if img.ndim == 2 else img - out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - return out_img - - -def gray2rgb(img): - """Convert a grayscale image to RGB image. - - Args: - img (ndarray): The input image. 
- - Returns: - ndarray: The converted RGB image. - """ - img = img[..., None] if img.ndim == 2 else img - out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) - return out_img - - -def _convert_input_type_range(img): - """Convert the type and range of the input image. - - It converts the input image to np.float32 type and range of [0, 1]. - It is mainly used for pre-processing the input image in colorspace - conversion functions such as rgb2ycbcr and ycbcr2rgb. - - Args: - img (ndarray): The input image. It accepts: - 1. np.uint8 type with range [0, 255]; - 2. np.float32 type with range [0, 1]. - - Returns: - (ndarray): The converted image with type of np.float32 and range of - [0, 1]. - """ - img_type = img.dtype - img = img.astype(np.float32) - if img_type == np.float32: - pass - elif img_type == np.uint8: - img /= 255. - else: - raise TypeError('The img type should be np.float32 or np.uint8, ' - f'but got {img_type}') - return img - - -def _convert_output_type_range(img, dst_type): - """Convert the type and range of the image according to dst_type. - - It converts the image to desired type and range. If `dst_type` is np.uint8, - images will be converted to np.uint8 type with range [0, 255]. If - `dst_type` is np.float32, it converts the image to np.float32 type with - range [0, 1]. - It is mainly used for post-processing images in colorspace conversion - functions such as rgb2ycbcr and ycbcr2rgb. - - Args: - img (ndarray): The image to be converted with np.float32 type and - range [0, 255]. - dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it - converts the image to np.uint8 type with range [0, 255]. If - dst_type is np.float32, it converts the image to np.float32 type - with range [0, 1]. - - Returns: - (ndarray): The converted image with desired type and range. - """ - if dst_type not in (np.uint8, np.float32): - raise TypeError('The dst_type should be np.float32 or np.uint8, ' - f'but got {dst_type}') - if dst_type == np.uint8: - img = img.round() - else: - img /= 255. - return img.astype(dst_type) - - -def rgb2ycbcr(img, y_only=False): - """Convert a RGB image to YCbCr image. - - This function produces the same results as Matlab's `rgb2ycbcr` function. - It implements the ITU-R BT.601 conversion for standard-definition - television. See more details in - https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. - - It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. - In OpenCV, it implements a JPEG conversion. See more details in - https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. - - Args: - img (ndarray): The input image. It accepts: - 1. np.uint8 type with range [0, 255]; - 2. np.float32 type with range [0, 1]. - y_only (bool): Whether to only return Y channel. Default: False. - - Returns: - ndarray: The converted YCbCr image. The output image has the same type - and range as input image. - """ - img_type = img.dtype - img = _convert_input_type_range(img) - if y_only: - out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 - else: - out_img = np.matmul( - img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], - [24.966, 112.0, -18.214]]) + [16, 128, 128] - out_img = _convert_output_type_range(out_img, img_type) - return out_img - - -def bgr2ycbcr(img, y_only=False): - """Convert a BGR image to YCbCr image. - - The bgr version of rgb2ycbcr. - It implements the ITU-R BT.601 conversion for standard-definition - television. See more details in - https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. 
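`rgb2ycbcr` above follows the ITU-R BT.601 (Matlab-style) convention rather than OpenCV's JPEG variant, and it preserves the input's dtype and range. A small sketch, assuming the function is importable from the vendored image package; the random test image is illustrative.

import numpy as np
from annotator.uniformer.mmcv.image import rgb2ycbcr

img = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)   # assumed RGB input
y = rgb2ycbcr(img, y_only=True)    # luma only, uint8, nominal range [16, 235]
ycbcr = rgb2ycbcr(img)             # 3-channel YCbCr, same dtype/range as input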
- - It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. - In OpenCV, it implements a JPEG conversion. See more details in - https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. - - Args: - img (ndarray): The input image. It accepts: - 1. np.uint8 type with range [0, 255]; - 2. np.float32 type with range [0, 1]. - y_only (bool): Whether to only return Y channel. Default: False. - - Returns: - ndarray: The converted YCbCr image. The output image has the same type - and range as input image. - """ - img_type = img.dtype - img = _convert_input_type_range(img) - if y_only: - out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 - else: - out_img = np.matmul( - img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], - [65.481, -37.797, 112.0]]) + [16, 128, 128] - out_img = _convert_output_type_range(out_img, img_type) - return out_img - - -def ycbcr2rgb(img): - """Convert a YCbCr image to RGB image. - - This function produces the same results as Matlab's ycbcr2rgb function. - It implements the ITU-R BT.601 conversion for standard-definition - television. See more details in - https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. - - It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. - In OpenCV, it implements a JPEG conversion. See more details in - https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. - - Args: - img (ndarray): The input image. It accepts: - 1. np.uint8 type with range [0, 255]; - 2. np.float32 type with range [0, 1]. - - Returns: - ndarray: The converted RGB image. The output image has the same type - and range as input image. - """ - img_type = img.dtype - img = _convert_input_type_range(img) * 255 - out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], - [0, -0.00153632, 0.00791071], - [0.00625893, -0.00318811, 0]]) * 255.0 + [ - -222.921, 135.576, -276.836 - ] - out_img = _convert_output_type_range(out_img, img_type) - return out_img - - -def ycbcr2bgr(img): - """Convert a YCbCr image to BGR image. - - The bgr version of ycbcr2rgb. - It implements the ITU-R BT.601 conversion for standard-definition - television. See more details in - https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. - - It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. - In OpenCV, it implements a JPEG conversion. See more details in - https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. - - Args: - img (ndarray): The input image. It accepts: - 1. np.uint8 type with range [0, 255]; - 2. np.float32 type with range [0, 1]. - - Returns: - ndarray: The converted BGR image. The output image has the same type - and range as input image. - """ - img_type = img.dtype - img = _convert_input_type_range(img) * 255 - out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], - [0.00791071, -0.00153632, 0], - [0, -0.00318811, 0.00625893]]) * 255.0 + [ - -276.836, 135.576, -222.921 - ] - out_img = _convert_output_type_range(out_img, img_type) - return out_img - - -def convert_color_factory(src, dst): - - code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') - - def convert_color(img): - out_img = cv2.cvtColor(img, code) - return out_img - - convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} - image. - - Args: - img (ndarray or str): The input image. - - Returns: - ndarray: The converted {dst.upper()} image. 
- """ - - return convert_color - - -bgr2rgb = convert_color_factory('bgr', 'rgb') - -rgb2bgr = convert_color_factory('rgb', 'bgr') - -bgr2hsv = convert_color_factory('bgr', 'hsv') - -hsv2bgr = convert_color_factory('hsv', 'bgr') - -bgr2hls = convert_color_factory('bgr', 'hls') - -hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/ControlNet/annotator/uniformer/mmcv/image/geometric.py b/ControlNet/annotator/uniformer/mmcv/image/geometric.py deleted file mode 100644 index cf97c201cb4e43796c911919d03fb26a07ed817d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/geometric.py +++ /dev/null @@ -1,728 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numbers - -import cv2 -import numpy as np - -from ..utils import to_2tuple -from .io import imread_backend - -try: - from PIL import Image -except ImportError: - Image = None - - -def _scale_size(size, scale): - """Rescale a size by a ratio. - - Args: - size (tuple[int]): (w, h). - scale (float | tuple(float)): Scaling factor. - - Returns: - tuple[int]: scaled size. - """ - if isinstance(scale, (float, int)): - scale = (scale, scale) - w, h = size - return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) - - -cv2_interp_codes = { - 'nearest': cv2.INTER_NEAREST, - 'bilinear': cv2.INTER_LINEAR, - 'bicubic': cv2.INTER_CUBIC, - 'area': cv2.INTER_AREA, - 'lanczos': cv2.INTER_LANCZOS4 -} - -if Image is not None: - pillow_interp_codes = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING - } - - -def imresize(img, - size, - return_scale=False, - interpolation='bilinear', - out=None, - backend=None): - """Resize image to a given size. - - Args: - img (ndarray): The input image. - size (tuple[int]): Target size (w, h). - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - out (ndarray): The output destination. - backend (str | None): The image resize backend type. Options are `cv2`, - `pillow`, `None`. If backend is None, the global imread_backend - specified by ``mmcv.use_backend()`` will be used. Default: None. - - Returns: - tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. - """ - h, w = img.shape[:2] - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported for resize.' - f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - pil_image = Image.fromarray(img) - pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) - resized_img = np.array(pil_image) - else: - resized_img = cv2.resize( - img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) - if not return_scale: - return resized_img - else: - w_scale = size[0] / w - h_scale = size[1] / h - return resized_img, w_scale, h_scale - - -def imresize_to_multiple(img, - divisor, - size=None, - scale_factor=None, - keep_ratio=False, - return_scale=False, - interpolation='bilinear', - out=None, - backend=None): - """Resize image according to a given size or scale factor and then rounds - up the the resized or rescaled image size to the nearest value that can be - divided by the divisor. 
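A quick sketch of imresize together with the factory-generated bgr2rgb helper above; module paths follow the diff headers:

import numpy as np
from annotator.uniformer.mmcv.image.colorspace import bgr2rgb
from annotator.uniformer.mmcv.image.geometric import imresize

img = np.zeros((240, 320, 3), dtype=np.uint8)          # (h, w, c), BGR order
resized, w_scale, h_scale = imresize(
    img, (640, 480), return_scale=True, interpolation='bilinear')
# size is given as (w, h), so resized.shape == (480, 640, 3) and both scales are 2.0
rgb = bgr2rgb(resized)                                  # plain cv2.cvtColor under the hood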
- - Args: - img (ndarray): The input image. - divisor (int | tuple): Resized image size will be a multiple of - divisor. If divisor is a tuple, divisor should be - (w_divisor, h_divisor). - size (None | int | tuple[int]): Target size (w, h). Default: None. - scale_factor (None | float | tuple[float]): Multiplier for spatial - size. Should match input size if it is a tuple and the 2D style is - (w_scale_factor, h_scale_factor). Default: None. - keep_ratio (bool): Whether to keep the aspect ratio when resizing the - image. Default: False. - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - out (ndarray): The output destination. - backend (str | None): The image resize backend type. Options are `cv2`, - `pillow`, `None`. If backend is None, the global imread_backend - specified by ``mmcv.use_backend()`` will be used. Default: None. - - Returns: - tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. - """ - h, w = img.shape[:2] - if size is not None and scale_factor is not None: - raise ValueError('only one of size or scale_factor should be defined') - elif size is None and scale_factor is None: - raise ValueError('one of size or scale_factor should be defined') - elif size is not None: - size = to_2tuple(size) - if keep_ratio: - size = rescale_size((w, h), size, return_scale=False) - else: - size = _scale_size((w, h), scale_factor) - - divisor = to_2tuple(divisor) - size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) - resized_img, w_scale, h_scale = imresize( - img, - size, - return_scale=True, - interpolation=interpolation, - out=out, - backend=backend) - if return_scale: - return resized_img, w_scale, h_scale - else: - return resized_img - - -def imresize_like(img, - dst_img, - return_scale=False, - interpolation='bilinear', - backend=None): - """Resize image to the same size of a given image. - - Args: - img (ndarray): The input image. - dst_img (ndarray): The target image. - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Same as :func:`resize`. - backend (str | None): Same as :func:`resize`. - - Returns: - tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. - """ - h, w = dst_img.shape[:2] - return imresize(img, (w, h), return_scale, interpolation, backend=backend) - - -def rescale_size(old_size, scale, return_scale=False): - """Calculate the new size to be rescaled to. - - Args: - old_size (tuple[int]): The old size (w, h) of image. - scale (float | tuple[int]): The scaling factor or maximum size. - If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image size. - - Returns: - tuple[int]: The new rescaled image size. 
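A sketch of imresize_to_multiple, e.g. to align an input to a stride-32 backbone (the divisor and scale_factor below are illustrative values):

import numpy as np
from annotator.uniformer.mmcv.image.geometric import imresize_to_multiple

img = np.zeros((500, 700, 3), dtype=np.uint8)
aligned = imresize_to_multiple(img, divisor=32, scale_factor=1.0)
# 700 -> ceil(700 / 32) * 32 = 704 and 500 -> 512, so aligned.shape == (512, 704, 3)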
- """ - w, h = old_size - if isinstance(scale, (float, int)): - if scale <= 0: - raise ValueError(f'Invalid scale {scale}, must be positive.') - scale_factor = scale - elif isinstance(scale, tuple): - max_long_edge = max(scale) - max_short_edge = min(scale) - scale_factor = min(max_long_edge / max(h, w), - max_short_edge / min(h, w)) - else: - raise TypeError( - f'Scale must be a number or tuple of int, but got {type(scale)}') - - new_size = _scale_size((w, h), scale_factor) - - if return_scale: - return new_size, scale_factor - else: - return new_size - - -def imrescale(img, - scale, - return_scale=False, - interpolation='bilinear', - backend=None): - """Resize image while keeping the aspect ratio. - - Args: - img (ndarray): The input image. - scale (float | tuple[int]): The scaling factor or maximum size. - If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image. - interpolation (str): Same as :func:`resize`. - backend (str | None): Same as :func:`resize`. - - Returns: - ndarray: The rescaled image. - """ - h, w = img.shape[:2] - new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) - rescaled_img = imresize( - img, new_size, interpolation=interpolation, backend=backend) - if return_scale: - return rescaled_img, scale_factor - else: - return rescaled_img - - -def imflip(img, direction='horizontal'): - """Flip an image horizontally or vertically. - - Args: - img (ndarray): Image to be flipped. - direction (str): The flip direction, either "horizontal" or - "vertical" or "diagonal". - - Returns: - ndarray: The flipped image. - """ - assert direction in ['horizontal', 'vertical', 'diagonal'] - if direction == 'horizontal': - return np.flip(img, axis=1) - elif direction == 'vertical': - return np.flip(img, axis=0) - else: - return np.flip(img, axis=(0, 1)) - - -def imflip_(img, direction='horizontal'): - """Inplace flip an image horizontally or vertically. - - Args: - img (ndarray): Image to be flipped. - direction (str): The flip direction, either "horizontal" or - "vertical" or "diagonal". - - Returns: - ndarray: The flipped image (inplace). - """ - assert direction in ['horizontal', 'vertical', 'diagonal'] - if direction == 'horizontal': - return cv2.flip(img, 1, img) - elif direction == 'vertical': - return cv2.flip(img, 0, img) - else: - return cv2.flip(img, -1, img) - - -def imrotate(img, - angle, - center=None, - scale=1.0, - border_value=0, - interpolation='bilinear', - auto_bound=False): - """Rotate an image. - - Args: - img (ndarray): Image to be rotated. - angle (float): Rotation angle in degrees, positive values mean - clockwise rotation. - center (tuple[float], optional): Center point (w, h) of the rotation in - the source image. If not specified, the center of the image will be - used. - scale (float): Isotropic scale factor. - border_value (int): Border value. - interpolation (str): Same as :func:`resize`. - auto_bound (bool): Whether to adjust the image size to cover the whole - rotated image. - - Returns: - ndarray: The rotated image. 
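A sketch of the aspect-ratio-preserving path, imrescale, plus a flip:

import numpy as np
from annotator.uniformer.mmcv.image.geometric import imflip, imrescale

img = np.zeros((400, 600, 3), dtype=np.uint8)
small, scale = imrescale(img, (256, 256), return_scale=True)
# the long edge is capped at 256: scale == 256 / 600 and small.shape == (171, 256, 3)
flipped = imflip(small, direction='horizontal')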
- """ - if center is not None and auto_bound: - raise ValueError('`auto_bound` conflicts with `center`') - h, w = img.shape[:2] - if center is None: - center = ((w - 1) * 0.5, (h - 1) * 0.5) - assert isinstance(center, tuple) - - matrix = cv2.getRotationMatrix2D(center, -angle, scale) - if auto_bound: - cos = np.abs(matrix[0, 0]) - sin = np.abs(matrix[0, 1]) - new_w = h * sin + w * cos - new_h = h * cos + w * sin - matrix[0, 2] += (new_w - w) * 0.5 - matrix[1, 2] += (new_h - h) * 0.5 - w = int(np.round(new_w)) - h = int(np.round(new_h)) - rotated = cv2.warpAffine( - img, - matrix, (w, h), - flags=cv2_interp_codes[interpolation], - borderValue=border_value) - return rotated - - -def bbox_clip(bboxes, img_shape): - """Clip bboxes to fit the image shape. - - Args: - bboxes (ndarray): Shape (..., 4*k) - img_shape (tuple[int]): (height, width) of the image. - - Returns: - ndarray: Clipped bboxes. - """ - assert bboxes.shape[-1] % 4 == 0 - cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) - cmin[0::2] = img_shape[1] - 1 - cmin[1::2] = img_shape[0] - 1 - clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) - return clipped_bboxes - - -def bbox_scaling(bboxes, scale, clip_shape=None): - """Scaling bboxes w.r.t the box center. - - Args: - bboxes (ndarray): Shape(..., 4). - scale (float): Scaling factor. - clip_shape (tuple[int], optional): If specified, bboxes that exceed the - boundary will be clipped according to the given shape (h, w). - - Returns: - ndarray: Scaled bboxes. - """ - if float(scale) == 1.0: - scaled_bboxes = bboxes.copy() - else: - w = bboxes[..., 2] - bboxes[..., 0] + 1 - h = bboxes[..., 3] - bboxes[..., 1] + 1 - dw = (w * (scale - 1)) * 0.5 - dh = (h * (scale - 1)) * 0.5 - scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) - if clip_shape is not None: - return bbox_clip(scaled_bboxes, clip_shape) - else: - return scaled_bboxes - - -def imcrop(img, bboxes, scale=1.0, pad_fill=None): - """Crop image patches. - - 3 steps: scale the bboxes -> clip bboxes -> crop and pad. - - Args: - img (ndarray): Image to be cropped. - bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. - scale (float, optional): Scale ratio of bboxes, the default value - 1.0 means no padding. - pad_fill (Number | list[Number]): Value to be filled for padding. - Default: None, which means no padding. - - Returns: - list[ndarray] | ndarray: The cropped image patches. - """ - chn = 1 if img.ndim == 2 else img.shape[2] - if pad_fill is not None: - if isinstance(pad_fill, (int, float)): - pad_fill = [pad_fill for _ in range(chn)] - assert len(pad_fill) == chn - - _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes - scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) - clipped_bbox = bbox_clip(scaled_bboxes, img.shape) - - patches = [] - for i in range(clipped_bbox.shape[0]): - x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) - if pad_fill is None: - patch = img[y1:y2 + 1, x1:x2 + 1, ...] - else: - _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) - if chn == 1: - patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1) - else: - patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn) - patch = np.array( - pad_fill, dtype=img.dtype) * np.ones( - patch_shape, dtype=img.dtype) - x_start = 0 if _x1 >= 0 else -_x1 - y_start = 0 if _y1 >= 0 else -_y1 - w = x2 - x1 + 1 - h = y2 - y1 + 1 - patch[y_start:y_start + h, x_start:x_start + w, - ...] = img[y1:y1 + h, x1:x1 + w, ...] 
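# At this point `patch` is the pad_fill canvas sized to the scaled (unclipped)
# bbox, and the slice assignment above copied the clipped in-image region into
# it at offset (x_start, y_start), so any part of the bbox that falls outside
# the image keeps the fill value.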
- patches.append(patch) - - if bboxes.ndim == 1: - return patches[0] - else: - return patches - - -def impad(img, - *, - shape=None, - padding=None, - pad_val=0, - padding_mode='constant'): - """Pad the given image to a certain shape or pad on all sides with - specified padding mode and padding value. - - Args: - img (ndarray): Image to be padded. - shape (tuple[int]): Expected padding shape (h, w). Default: None. - padding (int or tuple[int]): Padding on each border. If a single int is - provided this is used to pad all borders. If tuple of length 2 is - provided this is the padding on left/right and top/bottom - respectively. If a tuple of length 4 is provided this is the - padding for the left, top, right and bottom borders respectively. - Default: None. Note that `shape` and `padding` can not be both - set. - pad_val (Number | Sequence[Number]): Values to be filled in padding - areas when padding_mode is 'constant'. Default: 0. - padding_mode (str): Type of padding. Should be: constant, edge, - reflect or symmetric. Default: constant. - - - constant: pads with a constant value, this value is specified - with pad_val. - - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the - last value on the edge. For example, padding [1, 2, 3, 4] - with 2 elements on both sides in reflect mode will result - in [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with - 2 elements on both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] - - Returns: - ndarray: The padded image. - """ - - assert (shape is not None) ^ (padding is not None) - if shape is not None: - padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) - - # check pad_val - if isinstance(pad_val, tuple): - assert len(pad_val) == img.shape[-1] - elif not isinstance(pad_val, numbers.Number): - raise TypeError('pad_val must be a int or a tuple. ' - f'But received {type(pad_val)}') - - # check padding - if isinstance(padding, tuple) and len(padding) in [2, 4]: - if len(padding) == 2: - padding = (padding[0], padding[1], padding[0], padding[1]) - elif isinstance(padding, numbers.Number): - padding = (padding, padding, padding, padding) - else: - raise ValueError('Padding must be a int or a 2, or 4 element tuple.' - f'But received {padding}') - - # check padding mode - assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] - - border_type = { - 'constant': cv2.BORDER_CONSTANT, - 'edge': cv2.BORDER_REPLICATE, - 'reflect': cv2.BORDER_REFLECT_101, - 'symmetric': cv2.BORDER_REFLECT - } - img = cv2.copyMakeBorder( - img, - padding[1], - padding[3], - padding[0], - padding[2], - border_type[padding_mode], - value=pad_val) - - return img - - -def impad_to_multiple(img, divisor, pad_val=0): - """Pad an image to ensure each edge to be multiple to some number. - - Args: - img (ndarray): Image to be padded. - divisor (int): Padded image edges will be multiple to divisor. - pad_val (Number | Sequence[Number]): Same as :func:`impad`. - - Returns: - ndarray: The padded image. - """ - pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor - pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor - return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) - - -def cutout(img, shape, pad_val=0): - """Randomly cut out a rectangle from the original img. - - Args: - img (ndarray): Image to be cutout. - shape (int | tuple[int]): Expected cutout shape (h, w). 
If given as a - int, the value will be used for both h and w. - pad_val (int | float | tuple[int | float]): Values to be filled in the - cut area. Defaults to 0. - - Returns: - ndarray: The cutout image. - """ - - channels = 1 if img.ndim == 2 else img.shape[2] - if isinstance(shape, int): - cut_h, cut_w = shape, shape - else: - assert isinstance(shape, tuple) and len(shape) == 2, \ - f'shape must be a int or a tuple with length 2, but got type ' \ - f'{type(shape)} instead.' - cut_h, cut_w = shape - if isinstance(pad_val, (int, float)): - pad_val = tuple([pad_val] * channels) - elif isinstance(pad_val, tuple): - assert len(pad_val) == channels, \ - 'Expected the num of elements in tuple equals the channels' \ - 'of input image. Found {} vs {}'.format( - len(pad_val), channels) - else: - raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') - - img_h, img_w = img.shape[:2] - y0 = np.random.uniform(img_h) - x0 = np.random.uniform(img_w) - - y1 = int(max(0, y0 - cut_h / 2.)) - x1 = int(max(0, x0 - cut_w / 2.)) - y2 = min(img_h, y1 + cut_h) - x2 = min(img_w, x1 + cut_w) - - if img.ndim == 2: - patch_shape = (y2 - y1, x2 - x1) - else: - patch_shape = (y2 - y1, x2 - x1, channels) - - img_cutout = img.copy() - patch = np.array( - pad_val, dtype=img.dtype) * np.ones( - patch_shape, dtype=img.dtype) - img_cutout[y1:y2, x1:x2, ...] = patch - - return img_cutout - - -def _get_shear_matrix(magnitude, direction='horizontal'): - """Generate the shear matrix for transformation. - - Args: - magnitude (int | float): The magnitude used for shear. - direction (str): The flip direction, either "horizontal" - or "vertical". - - Returns: - ndarray: The shear matrix with dtype float32. - """ - if direction == 'horizontal': - shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) - elif direction == 'vertical': - shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) - return shear_matrix - - -def imshear(img, - magnitude, - direction='horizontal', - border_value=0, - interpolation='bilinear'): - """Shear an image. - - Args: - img (ndarray): Image to be sheared with format (h, w) - or (h, w, c). - magnitude (int | float): The magnitude used for shear. - direction (str): The flip direction, either "horizontal" - or "vertical". - border_value (int | tuple[int]): Value used in case of a - constant border. - interpolation (str): Same as :func:`resize`. - - Returns: - ndarray: The sheared image. - """ - assert direction in ['horizontal', - 'vertical'], f'Invalid direction: {direction}' - height, width = img.shape[:2] - if img.ndim == 2: - channels = 1 - elif img.ndim == 3: - channels = img.shape[-1] - if isinstance(border_value, int): - border_value = tuple([border_value] * channels) - elif isinstance(border_value, tuple): - assert len(border_value) == channels, \ - 'Expected the num of elements in tuple equals the channels' \ - 'of input image. Found {} vs {}'.format( - len(border_value), channels) - else: - raise ValueError( - f'Invalid type {type(border_value)} for `border_value`') - shear_matrix = _get_shear_matrix(magnitude, direction) - sheared = cv2.warpAffine( - img, - shear_matrix, - (width, height), - # Note case when the number elements in `border_value` - # greater than 3 (e.g. shearing masks whose channels large - # than 3) will raise TypeError in `cv2.warpAffine`. - # Here simply slice the first 3 values in `border_value`. 
- borderValue=border_value[:3], - flags=cv2_interp_codes[interpolation]) - return sheared - - -def _get_translate_matrix(offset, direction='horizontal'): - """Generate the translate matrix. - - Args: - offset (int | float): The offset used for translate. - direction (str): The translate direction, either - "horizontal" or "vertical". - - Returns: - ndarray: The translate matrix with dtype float32. - """ - if direction == 'horizontal': - translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) - elif direction == 'vertical': - translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) - return translate_matrix - - -def imtranslate(img, - offset, - direction='horizontal', - border_value=0, - interpolation='bilinear'): - """Translate an image. - - Args: - img (ndarray): Image to be translated with format - (h, w) or (h, w, c). - offset (int | float): The offset used for translate. - direction (str): The translate direction, either "horizontal" - or "vertical". - border_value (int | tuple[int]): Value used in case of a - constant border. - interpolation (str): Same as :func:`resize`. - - Returns: - ndarray: The translated image. - """ - assert direction in ['horizontal', - 'vertical'], f'Invalid direction: {direction}' - height, width = img.shape[:2] - if img.ndim == 2: - channels = 1 - elif img.ndim == 3: - channels = img.shape[-1] - if isinstance(border_value, int): - border_value = tuple([border_value] * channels) - elif isinstance(border_value, tuple): - assert len(border_value) == channels, \ - 'Expected the num of elements in tuple equals the channels' \ - 'of input image. Found {} vs {}'.format( - len(border_value), channels) - else: - raise ValueError( - f'Invalid type {type(border_value)} for `border_value`.') - translate_matrix = _get_translate_matrix(offset, direction) - translated = cv2.warpAffine( - img, - translate_matrix, - (width, height), - # Note case when the number elements in `border_value` - # greater than 3 (e.g. translating masks whose channels - # large than 3) will raise TypeError in `cv2.warpAffine`. - # Here simply slice the first 3 values in `border_value`. - borderValue=border_value[:3], - flags=cv2_interp_codes[interpolation]) - return translated diff --git a/ControlNet/annotator/uniformer/mmcv/image/io.py b/ControlNet/annotator/uniformer/mmcv/image/io.py deleted file mode 100644 index d3fa2e8cc06b1a7b0b69de6406980b15d61a1e5d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/io.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
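Before the I/O helpers, a short sketch of the affine helpers that close out geometric.py above (imshear and imtranslate both build a 2x3 float32 matrix and hand it to cv2.warpAffine):

import numpy as np
from annotator.uniformer.mmcv.image.geometric import imshear, imtranslate

img = np.zeros((100, 100, 3), dtype=np.uint8)
sheared = imshear(img, magnitude=0.3, direction='horizontal', border_value=255)
shifted = imtranslate(img, offset=10, direction='vertical')
# both keep the input's (100, 100, 3) shape; uncovered pixels take border_value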
-import io -import os.path as osp -from pathlib import Path - -import cv2 -import numpy as np -from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, - IMREAD_UNCHANGED) - -from annotator.uniformer.mmcv.utils import check_file_exist, is_str, mkdir_or_exist - -try: - from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG -except ImportError: - TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None - -try: - from PIL import Image, ImageOps -except ImportError: - Image = None - -try: - import tifffile -except ImportError: - tifffile = None - -jpeg = None -supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] - -imread_flags = { - 'color': IMREAD_COLOR, - 'grayscale': IMREAD_GRAYSCALE, - 'unchanged': IMREAD_UNCHANGED, - 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, - 'grayscale_ignore_orientation': - IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE -} - -imread_backend = 'cv2' - - -def use_backend(backend): - """Select a backend for image decoding. - - Args: - backend (str): The image decoding backend type. Options are `cv2`, - `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) - and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` - file format. - """ - assert backend in supported_backends - global imread_backend - imread_backend = backend - if imread_backend == 'turbojpeg': - if TurboJPEG is None: - raise ImportError('`PyTurboJPEG` is not installed') - global jpeg - if jpeg is None: - jpeg = TurboJPEG() - elif imread_backend == 'pillow': - if Image is None: - raise ImportError('`Pillow` is not installed') - elif imread_backend == 'tifffile': - if tifffile is None: - raise ImportError('`tifffile` is not installed') - - -def _jpegflag(flag='color', channel_order='bgr'): - channel_order = channel_order.lower() - if channel_order not in ['rgb', 'bgr']: - raise ValueError('channel order must be either "rgb" or "bgr"') - - if flag == 'color': - if channel_order == 'bgr': - return TJPF_BGR - elif channel_order == 'rgb': - return TJCS_RGB - elif flag == 'grayscale': - return TJPF_GRAY - else: - raise ValueError('flag must be "color" or "grayscale"') - - -def _pillow2array(img, flag='color', channel_order='bgr'): - """Convert a pillow image to numpy array. - - Args: - img (:obj:`PIL.Image.Image`): The image loaded using PIL - flag (str): Flags specifying the color type of a loaded image, - candidates are 'color', 'grayscale' and 'unchanged'. - Default to 'color'. - channel_order (str): The channel order of the output image array, - candidates are 'bgr' and 'rgb'. Default to 'bgr'. - - Returns: - np.ndarray: The converted numpy array - """ - channel_order = channel_order.lower() - if channel_order not in ['rgb', 'bgr']: - raise ValueError('channel order must be either "rgb" or "bgr"') - - if flag == 'unchanged': - array = np.array(img) - if array.ndim >= 3 and array.shape[2] >= 3: # color image - array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR - else: - # Handle exif orientation tag - if flag in ['color', 'grayscale']: - img = ImageOps.exif_transpose(img) - # If the image mode is not 'RGB', convert it to 'RGB' first. - if img.mode != 'RGB': - if img.mode != 'LA': - # Most formats except 'LA' can be directly converted to RGB - img = img.convert('RGB') - else: - # When the mode is 'LA', the default conversion will fill in - # the canvas with black, which sometimes shadows black objects - # in the foreground. 
- # - # Therefore, a random color (124, 117, 104) is used for canvas - img_rgba = img.convert('RGBA') - img = Image.new('RGB', img_rgba.size, (124, 117, 104)) - img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha - if flag in ['color', 'color_ignore_orientation']: - array = np.array(img) - if channel_order != 'rgb': - array = array[:, :, ::-1] # RGB to BGR - elif flag in ['grayscale', 'grayscale_ignore_orientation']: - img = img.convert('L') - array = np.array(img) - else: - raise ValueError( - 'flag must be "color", "grayscale", "unchanged", ' - f'"color_ignore_orientation" or "grayscale_ignore_orientation"' - f' but got {flag}') - return array - - -def imread(img_or_path, flag='color', channel_order='bgr', backend=None): - """Read an image. - - Args: - img_or_path (ndarray or str or Path): Either a numpy array or str or - pathlib.Path. If it is a numpy array (loaded image), then - it will be returned as is. - flag (str): Flags specifying the color type of a loaded image, - candidates are `color`, `grayscale`, `unchanged`, - `color_ignore_orientation` and `grayscale_ignore_orientation`. - By default, `cv2` and `pillow` backend would rotate the image - according to its EXIF info unless called with `unchanged` or - `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend - always ignore image's EXIF info regardless of the flag. - The `turbojpeg` backend only supports `color` and `grayscale`. - channel_order (str): Order of channel, candidates are `bgr` and `rgb`. - backend (str | None): The image decoding backend type. Options are - `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. - If backend is None, the global imread_backend specified by - ``mmcv.use_backend()`` will be used. Default: None. - - Returns: - ndarray: Loaded image array. - """ - - if backend is None: - backend = imread_backend - if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") - if isinstance(img_or_path, Path): - img_or_path = str(img_or_path) - - if isinstance(img_or_path, np.ndarray): - return img_or_path - elif is_str(img_or_path): - check_file_exist(img_or_path, - f'img file does not exist: {img_or_path}') - if backend == 'turbojpeg': - with open(img_or_path, 'rb') as in_file: - img = jpeg.decode(in_file.read(), - _jpegflag(flag, channel_order)) - if img.shape[-1] == 1: - img = img[:, :, 0] - return img - elif backend == 'pillow': - img = Image.open(img_or_path) - img = _pillow2array(img, flag, channel_order) - return img - elif backend == 'tifffile': - img = tifffile.imread(img_or_path) - return img - else: - flag = imread_flags[flag] if is_str(flag) else flag - img = cv2.imread(img_or_path, flag) - if flag == IMREAD_COLOR and channel_order == 'rgb': - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) - return img - else: - raise TypeError('"img" must be a numpy array or a str or ' - 'a pathlib.Path object') - - -def imfrombytes(content, flag='color', channel_order='bgr', backend=None): - """Read an image from bytes. - - Args: - content (bytes): Image bytes got from files or other streams. - flag (str): Same as :func:`imread`. - backend (str | None): The image decoding backend type. Options are - `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the - global imread_backend specified by ``mmcv.use_backend()`` will be - used. Default: None. - - Returns: - ndarray: Loaded image array. 
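A sketch of imread with an explicit backend; 'demo.jpg' is a hypothetical path (imread verifies the file exists before decoding):

from annotator.uniformer.mmcv.image.io import imread, use_backend

use_backend('cv2')                             # 'pillow', 'turbojpeg' and 'tifffile' are also supported
img = imread('demo.jpg', flag='color', channel_order='rgb')   # hypothetical file
# img is an ndarray in RGB order; flag='grayscale' yields a 2-D array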
- """ - - if backend is None: - backend = imread_backend - if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") - if backend == 'turbojpeg': - img = jpeg.decode(content, _jpegflag(flag, channel_order)) - if img.shape[-1] == 1: - img = img[:, :, 0] - return img - elif backend == 'pillow': - buff = io.BytesIO(content) - img = Image.open(buff) - img = _pillow2array(img, flag, channel_order) - return img - else: - img_np = np.frombuffer(content, np.uint8) - flag = imread_flags[flag] if is_str(flag) else flag - img = cv2.imdecode(img_np, flag) - if flag == IMREAD_COLOR and channel_order == 'rgb': - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) - return img - - -def imwrite(img, file_path, params=None, auto_mkdir=True): - """Write image to file. - - Args: - img (ndarray): Image array to be written. - file_path (str): Image file path. - params (None or list): Same as opencv :func:`imwrite` interface. - auto_mkdir (bool): If the parent folder of `file_path` does not exist, - whether to create it automatically. - - Returns: - bool: Successful or not. - """ - if auto_mkdir: - dir_name = osp.abspath(osp.dirname(file_path)) - mkdir_or_exist(dir_name) - return cv2.imwrite(file_path, img, params) diff --git a/ControlNet/annotator/uniformer/mmcv/image/misc.py b/ControlNet/annotator/uniformer/mmcv/image/misc.py deleted file mode 100644 index 3e61f05e3b05e4c7b40de4eb6c8eb100e6da41d0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/misc.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np - -import annotator.uniformer.mmcv as mmcv - -try: - import torch -except ImportError: - torch = None - - -def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): - """Convert tensor to 3-channel images. - - Args: - tensor (torch.Tensor): Tensor that contains multiple images, shape ( - N, C, H, W). - mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). - std (tuple[float], optional): Standard deviation of images. - Defaults to (1, 1, 1). - to_rgb (bool, optional): Whether the tensor was converted to RGB - format in the first place. If so, convert it back to BGR. - Defaults to True. - - Returns: - list[np.ndarray]: A list that contains multiple images. - """ - - if torch is None: - raise RuntimeError('pytorch is not installed') - assert torch.is_tensor(tensor) and tensor.ndim == 4 - assert len(mean) == 3 - assert len(std) == 3 - - num_imgs = tensor.size(0) - mean = np.array(mean, dtype=np.float32) - std = np.array(std, dtype=np.float32) - imgs = [] - for img_id in range(num_imgs): - img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) - img = mmcv.imdenormalize( - img, mean, std, to_bgr=to_rgb).astype(np.uint8) - imgs.append(np.ascontiguousarray(img)) - return imgs diff --git a/ControlNet/annotator/uniformer/mmcv/image/photometric.py b/ControlNet/annotator/uniformer/mmcv/image/photometric.py deleted file mode 100644 index 5085d012019c0cbf56f66f421a378278c1a058ae..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/image/photometric.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import cv2 -import numpy as np - -from ..utils import is_tuple_of -from .colorspace import bgr2gray, gray2bgr - - -def imnormalize(img, mean, std, to_rgb=True): - """Normalize an image with mean and std. - - Args: - img (ndarray): Image to be normalized. 
- mean (ndarray): The mean to be used for normalize. - std (ndarray): The std to be used for normalize. - to_rgb (bool): Whether to convert to rgb. - - Returns: - ndarray: The normalized image. - """ - img = img.copy().astype(np.float32) - return imnormalize_(img, mean, std, to_rgb) - - -def imnormalize_(img, mean, std, to_rgb=True): - """Inplace normalize an image with mean and std. - - Args: - img (ndarray): Image to be normalized. - mean (ndarray): The mean to be used for normalize. - std (ndarray): The std to be used for normalize. - to_rgb (bool): Whether to convert to rgb. - - Returns: - ndarray: The normalized image. - """ - # cv2 inplace normalization does not accept uint8 - assert img.dtype != np.uint8 - mean = np.float64(mean.reshape(1, -1)) - stdinv = 1 / np.float64(std.reshape(1, -1)) - if to_rgb: - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace - cv2.subtract(img, mean, img) # inplace - cv2.multiply(img, stdinv, img) # inplace - return img - - -def imdenormalize(img, mean, std, to_bgr=True): - assert img.dtype != np.uint8 - mean = mean.reshape(1, -1).astype(np.float64) - std = std.reshape(1, -1).astype(np.float64) - img = cv2.multiply(img, std) # make a copy - cv2.add(img, mean, img) # inplace - if to_bgr: - cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace - return img - - -def iminvert(img): - """Invert (negate) an image. - - Args: - img (ndarray): Image to be inverted. - - Returns: - ndarray: The inverted image. - """ - return np.full_like(img, 255) - img - - -def solarize(img, thr=128): - """Solarize an image (invert all pixel values above a threshold) - - Args: - img (ndarray): Image to be solarized. - thr (int): Threshold for solarizing (0 - 255). - - Returns: - ndarray: The solarized image. - """ - img = np.where(img < thr, img, 255 - img) - return img - - -def posterize(img, bits): - """Posterize an image (reduce the number of bits for each color channel) - - Args: - img (ndarray): Image to be posterized. - bits (int): Number of bits (1 to 8) to use for posterizing. - - Returns: - ndarray: The posterized image. - """ - shift = 8 - bits - img = np.left_shift(np.right_shift(img, shift), shift) - return img - - -def adjust_color(img, alpha=1, beta=None, gamma=0): - r"""It blends the source image and its gray image: - - .. math:: - output = img * alpha + gray\_img * beta + gamma - - Args: - img (ndarray): The input source image. - alpha (int | float): Weight for the source image. Default 1. - beta (int | float): Weight for the converted gray image. - If None, it's assigned the value (1 - `alpha`). - gamma (int | float): Scalar added to each sum. - Same as :func:`cv2.addWeighted`. Default 0. - - Returns: - ndarray: Colored image which has the same size and dtype as input. - """ - gray_img = bgr2gray(img) - gray_img = np.tile(gray_img[..., None], [1, 1, 3]) - if beta is None: - beta = 1 - alpha - colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) - if not colored_img.dtype == np.uint8: - # Note when the dtype of `img` is not the default `np.uint8` - # (e.g. np.float32), the value in `colored_img` got from cv2 - # is not guaranteed to be in range [0, 255], so here clip - # is needed. - colored_img = np.clip(colored_img, 0, 255) - return colored_img - - -def imequalize(img): - """Equalize the image histogram. - - This function applies a non-linear mapping to the input image, - in order to create a uniform distribution of grayscale values - in the output image. - - Args: - img (ndarray): Image to be equalized. - - Returns: - ndarray: The equalized image. 
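A round-trip sketch of imnormalize / imdenormalize defined above; the mean and std values are just example (ImageNet-style) numbers:

import numpy as np
from annotator.uniformer.mmcv.image.photometric import imdenormalize, imnormalize

img = np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8)        # BGR image
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

norm = imnormalize(img, mean, std, to_rgb=True)     # float32, BGR -> RGB, then (x - mean) / std
back = imdenormalize(norm, mean, std, to_bgr=True)  # approximately the original BGR values
assert np.allclose(back, img.astype(np.float32), atol=1e-3)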
- """ - - def _scale_channel(im, c): - """Scale the data in the corresponding channel.""" - im = im[:, :, c] - # Compute the histogram of the image channel. - histo = np.histogram(im, 256, (0, 255))[0] - # For computing the step, filter out the nonzeros. - nonzero_histo = histo[histo > 0] - step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 - if not step: - lut = np.array(range(256)) - else: - # Compute the cumulative sum, shifted by step // 2 - # and then normalized by step. - lut = (np.cumsum(histo) + (step // 2)) // step - # Shift lut, prepending with 0. - lut = np.concatenate([[0], lut[:-1]], 0) - # handle potential integer overflow - lut[lut > 255] = 255 - # If step is zero, return the original image. - # Otherwise, index from lut. - return np.where(np.equal(step, 0), im, lut[im]) - - # Scales each channel independently and then stacks - # the result. - s1 = _scale_channel(img, 0) - s2 = _scale_channel(img, 1) - s3 = _scale_channel(img, 2) - equalized_img = np.stack([s1, s2, s3], axis=-1) - return equalized_img.astype(img.dtype) - - -def adjust_brightness(img, factor=1.): - """Adjust image brightness. - - This function controls the brightness of an image. An - enhancement factor of 0.0 gives a black image. - A factor of 1.0 gives the original image. This function - blends the source image and the degenerated black image: - - .. math:: - output = img * factor + degenerated * (1 - factor) - - Args: - img (ndarray): Image to be brightened. - factor (float): A value controls the enhancement. - Factor 1.0 returns the original image, lower - factors mean less color (brightness, contrast, - etc), and higher values more. Default 1. - - Returns: - ndarray: The brightened image. - """ - degenerated = np.zeros_like(img) - # Note manually convert the dtype to np.float32, to - # achieve as close results as PIL.ImageEnhance.Brightness. - # Set beta=1-factor, and gamma=0 - brightened_img = cv2.addWeighted( - img.astype(np.float32), factor, degenerated.astype(np.float32), - 1 - factor, 0) - brightened_img = np.clip(brightened_img, 0, 255) - return brightened_img.astype(img.dtype) - - -def adjust_contrast(img, factor=1.): - """Adjust image contrast. - - This function controls the contrast of an image. An - enhancement factor of 0.0 gives a solid grey - image. A factor of 1.0 gives the original image. It - blends the source image and the degenerated mean image: - - .. math:: - output = img * factor + degenerated * (1 - factor) - - Args: - img (ndarray): Image to be contrasted. BGR order. - factor (float): Same as :func:`mmcv.adjust_brightness`. - - Returns: - ndarray: The contrasted image. - """ - gray_img = bgr2gray(img) - hist = np.histogram(gray_img, 256, (0, 255))[0] - mean = round(np.sum(gray_img) / np.sum(hist)) - degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) - degenerated = gray2bgr(degenerated) - contrasted_img = cv2.addWeighted( - img.astype(np.float32), factor, degenerated.astype(np.float32), - 1 - factor, 0) - contrasted_img = np.clip(contrasted_img, 0, 255) - return contrasted_img.astype(img.dtype) - - -def auto_contrast(img, cutoff=0): - """Auto adjust image contrast. - - This function maximize (normalize) image contrast by first removing cutoff - percent of the lightest and darkest pixels from the histogram and remapping - the image so that the darkest pixel becomes black (0), and the lightest - becomes white (255). - - Args: - img (ndarray): Image to be contrasted. BGR order. 
- cutoff (int | float | tuple): The cutoff percent of the lightest and - darkest pixels to be removed. If given as tuple, it shall be - (low, high). Otherwise, the single value will be used for both. - Defaults to 0. - - Returns: - ndarray: The contrasted image. - """ - - def _auto_contrast_channel(im, c, cutoff): - im = im[:, :, c] - # Compute the histogram of the image channel. - histo = np.histogram(im, 256, (0, 255))[0] - # Remove cut-off percent pixels from histo - histo_sum = np.cumsum(histo) - cut_low = histo_sum[-1] * cutoff[0] // 100 - cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 - histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low - histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) - - # Compute mapping - low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] - # If all the values have been cut off, return the origin img - if low >= high: - return im - scale = 255.0 / (high - low) - offset = -low * scale - lut = np.array(range(256)) - lut = lut * scale + offset - lut = np.clip(lut, 0, 255) - return lut[im] - - if isinstance(cutoff, (int, float)): - cutoff = (cutoff, cutoff) - else: - assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ - f'float or tuple, but got {type(cutoff)} instead.' - # Auto adjusts contrast for each channel independently and then stacks - # the result. - s1 = _auto_contrast_channel(img, 0, cutoff) - s2 = _auto_contrast_channel(img, 1, cutoff) - s3 = _auto_contrast_channel(img, 2, cutoff) - contrasted_img = np.stack([s1, s2, s3], axis=-1) - return contrasted_img.astype(img.dtype) - - -def adjust_sharpness(img, factor=1., kernel=None): - """Adjust image sharpness. - - This function controls the sharpness of an image. An - enhancement factor of 0.0 gives a blurred image. A - factor of 1.0 gives the original image. And a factor - of 2.0 gives a sharpened image. It blends the source - image and the degenerated mean image: - - .. math:: - output = img * factor + degenerated * (1 - factor) - - Args: - img (ndarray): Image to be sharpened. BGR order. - factor (float): Same as :func:`mmcv.adjust_brightness`. - kernel (np.ndarray, optional): Filter kernel to be applied on the img - to obtain the degenerated img. Defaults to None. - - Note: - No value sanity check is enforced on the kernel set by users. So with - an inappropriate kernel, the ``adjust_sharpness`` may fail to perform - the function its name indicates but end up performing whatever - transform determined by the kernel. - - Returns: - ndarray: The sharpened image. - """ - - if kernel is None: - # adopted from PIL.ImageFilter.SMOOTH - kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 - assert isinstance(kernel, np.ndarray), \ - f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' - assert kernel.ndim == 2, \ - f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' - - degenerated = cv2.filter2D(img, -1, kernel) - sharpened_img = cv2.addWeighted( - img.astype(np.float32), factor, degenerated.astype(np.float32), - 1 - factor, 0) - sharpened_img = np.clip(sharpened_img, 0, 255) - return sharpened_img.astype(img.dtype) - - -def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): - """AlexNet-style PCA jitter. - - This data augmentation is proposed in `ImageNet Classification with Deep - Convolutional Neural Networks - `_. - - Args: - img (ndarray): Image to be adjusted lighting. BGR order. - eigval (ndarray): the eigenvalue of the convariance matrix of pixel - values, respectively. 
- eigvec (ndarray): the eigenvector of the convariance matrix of pixel - values, respectively. - alphastd (float): The standard deviation for distribution of alpha. - Defaults to 0.1 - to_rgb (bool): Whether to convert img to rgb. - - Returns: - ndarray: The adjusted image. - """ - assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ - f'eigval and eigvec should both be of type np.ndarray, got ' \ - f'{type(eigval)} and {type(eigvec)} instead.' - - assert eigval.ndim == 1 and eigvec.ndim == 2 - assert eigvec.shape == (3, eigval.shape[0]) - n_eigval = eigval.shape[0] - assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ - f'got {type(alphastd)} instead.' - - img = img.copy().astype(np.float32) - if to_rgb: - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace - - alpha = np.random.normal(0, alphastd, n_eigval) - alter = eigvec \ - * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ - * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) - alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) - img_adjusted = img + alter - return img_adjusted - - -def lut_transform(img, lut_table): - """Transform array by look-up table. - - The function lut_transform fills the output array with values from the - look-up table. Indices of the entries are taken from the input array. - - Args: - img (ndarray): Image to be transformed. - lut_table (ndarray): look-up table of 256 elements; in case of - multi-channel input array, the table should either have a single - channel (in this case the same table is used for all channels) or - the same number of channels as in the input array. - - Returns: - ndarray: The transformed image. - """ - assert isinstance(img, np.ndarray) - assert 0 <= np.min(img) and np.max(img) <= 255 - assert isinstance(lut_table, np.ndarray) - assert lut_table.shape == (256, ) - - return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) - - -def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): - """Use CLAHE method to process the image. - - See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. - Graphics Gems, 1994:474-485.` for more information. - - Args: - img (ndarray): Image to be processed. - clip_limit (float): Threshold for contrast limiting. Default: 40.0. - tile_grid_size (tuple[int]): Size of grid for histogram equalization. - Input image will be divided into equally sized rectangular tiles. - It defines the number of tiles in row and column. Default: (8, 8). - - Returns: - ndarray: The processed image. 
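A sketch of lut_transform with a gamma curve, plus clahe (which operates on a single-channel 2-D image):

import numpy as np
from annotator.uniformer.mmcv.image.photometric import clahe, lut_transform

img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
gamma = 0.5
lut = np.clip(((np.arange(256) / 255.0) ** gamma) * 255.0, 0, 255).astype(np.uint8)
brightened = lut_transform(img, lut)                # same shape, gamma-corrected

gray = np.random.randint(0, 256, (64, 64), dtype=np.uint8)
out = clahe(gray, clip_limit=40.0, tile_grid_size=(8, 8))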
- """ - assert isinstance(img, np.ndarray) - assert img.ndim == 2 - assert isinstance(clip_limit, (float, int)) - assert is_tuple_of(tile_grid_size, int) - assert len(tile_grid_size) == 2 - - clahe = cv2.createCLAHE(clip_limit, tile_grid_size) - return clahe.apply(np.array(img, dtype=np.uint8)) diff --git a/ControlNet/annotator/uniformer/mmcv/model_zoo/deprecated.json b/ControlNet/annotator/uniformer/mmcv/model_zoo/deprecated.json deleted file mode 100644 index 25cf6f28caecc22a77e3136fefa6b8dfc0e6cb5b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/model_zoo/deprecated.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "resnet50_caffe": "detectron/resnet50_caffe", - "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", - "resnet101_caffe": "detectron/resnet101_caffe", - "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" -} diff --git a/ControlNet/annotator/uniformer/mmcv/model_zoo/mmcls.json b/ControlNet/annotator/uniformer/mmcv/model_zoo/mmcls.json deleted file mode 100644 index bdb311d9fe6d9f317290feedc9e37236c6cf6e8f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/model_zoo/mmcls.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", - "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", - "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", - "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", - "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", - "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", - "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", - "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", - "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth", - "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth", - "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth", - "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth", - "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth", - "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth", - "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth", - "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth", - "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", - "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", - "resnext101_32x8d": 
"https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", - "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", - "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", - "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", - "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", - "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", - "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", - "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", - "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", - "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", - "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth" -} diff --git a/ControlNet/annotator/uniformer/mmcv/model_zoo/open_mmlab.json b/ControlNet/annotator/uniformer/mmcv/model_zoo/open_mmlab.json deleted file mode 100644 index 8311db4feef92faa0841c697d75efbee8430c3a0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/model_zoo/open_mmlab.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", - "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", - "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", - "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", - "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", - "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", - "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", - "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", - "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", - "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", - "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", - "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", - "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", - "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", - "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", - "jhu/resnext101_32x4d_gn_ws": 
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", - "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", - "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", - "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", - "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", - "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", - "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", - "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", - "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", - "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", - "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", - "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", - "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", - "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", - "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", - "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", - "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", - "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", - "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", - "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", - "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", - "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", - "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", - "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", - "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", - "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", - "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", - "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", - "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", - "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", - "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", - "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", - "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" -} diff --git a/ControlNet/annotator/uniformer/mmcv/ops/__init__.py 
b/ControlNet/annotator/uniformer/mmcv/ops/__init__.py deleted file mode 100644 index 999e090a458ee148ceca0649f1e3806a40e909bd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/__init__.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .assign_score_withk import assign_score_withk -from .ball_query import ball_query -from .bbox import bbox_overlaps -from .border_align import BorderAlign, border_align -from .box_iou_rotated import box_iou_rotated -from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive -from .cc_attention import CrissCrossAttention -from .contour_expand import contour_expand -from .corner_pool import CornerPool -from .correlation import Correlation -from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d -from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, - ModulatedDeformRoIPoolPack, deform_roi_pool) -from .deprecated_wrappers import Conv2d_deprecated as Conv2d -from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d -from .deprecated_wrappers import Linear_deprecated as Linear -from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d -from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, - sigmoid_focal_loss, softmax_focal_loss) -from .furthest_point_sample import (furthest_point_sample, - furthest_point_sample_with_dist) -from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu -from .gather_points import gather_points -from .group_points import GroupAll, QueryAndGroup, grouping_operation -from .info import (get_compiler_version, get_compiling_cuda_version, - get_onnxruntime_op_path) -from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev -from .knn import knn -from .masked_conv import MaskedConv2d, masked_conv2d -from .modulated_deform_conv import (ModulatedDeformConv2d, - ModulatedDeformConv2dPack, - modulated_deform_conv2d) -from .multi_scale_deform_attn import MultiScaleDeformableAttention -from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms -from .pixel_group import pixel_group -from .point_sample import (SimpleRoIAlign, point_sample, - rel_roi_point_to_rel_img_point) -from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, - points_in_boxes_part) -from .points_sampler import PointsSampler -from .psa_mask import PSAMask -from .roi_align import RoIAlign, roi_align -from .roi_align_rotated import RoIAlignRotated, roi_align_rotated -from .roi_pool import RoIPool, roi_pool -from .roiaware_pool3d import RoIAwarePool3d -from .roipoint_pool3d import RoIPointPool3d -from .saconv import SAConv2d -from .scatter_points import DynamicScatter, dynamic_scatter -from .sync_bn import SyncBatchNorm -from .three_interpolate import three_interpolate -from .three_nn import three_nn -from .tin_shift import TINShift, tin_shift -from .upfirdn2d import upfirdn2d -from .voxelize import Voxelization, voxelization - -__all__ = [ - 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', - 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', - 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', - 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', - 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', - 'get_compiler_version', 'get_compiling_cuda_version', - 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', - 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', - 'modulated_deform_conv2d', 'batched_nms', 'nms', 
'soft_nms', 'nms_match', - 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', - 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', - 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', - 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', - 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', - 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', - 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', - 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', - 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', - 'border_align', 'gather_points', 'furthest_point_sample', - 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', - 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', - 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', - 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' -] diff --git a/ControlNet/annotator/uniformer/mmcv/ops/assign_score_withk.py b/ControlNet/annotator/uniformer/mmcv/ops/assign_score_withk.py deleted file mode 100644 index 4906adaa2cffd1b46912fbe7d4f87ef2f9fa0012..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/assign_score_withk.py +++ /dev/null @@ -1,123 +0,0 @@ -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) - - -class AssignScoreWithK(Function): - r"""Perform weighted sum to generate output features according to scores. - Modified from `PAConv `_. - - This is a memory-efficient CUDA implementation of assign_scores operation, - which first transform all point features with weight bank, then assemble - neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. - - See the `paper `_ appendix Sec. D for - more detailed descriptions. - - Note: - This implementation assumes using ``neighbor`` kernel input, which is - (point_features - center_features, point_features). - See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ - pointnet2/paconv.py#L128 for more details. - """ - - @staticmethod - def forward(ctx, - scores, - point_features, - center_features, - knn_idx, - aggregate='sum'): - """ - Args: - scores (torch.Tensor): (B, npoint, K, M), predicted scores to - aggregate weight matrices in the weight bank. - ``npoint`` is the number of sampled centers. - ``K`` is the number of queried neighbors. - ``M`` is the number of weight matrices in the weight bank. - point_features (torch.Tensor): (B, N, M, out_dim) - Pre-computed point features to be aggregated. - center_features (torch.Tensor): (B, N, M, out_dim) - Pre-computed center features to be aggregated. - knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN. - We assume the first idx in each row is the idx of the center. - aggregate (str, optional): Aggregation method. - Can be 'sum', 'avg' or 'max'. Defaults: 'sum'. - - Returns: - torch.Tensor: (B, out_dim, npoint, K), the aggregated features. 
- """ - agg = {'sum': 0, 'avg': 1, 'max': 2} - - B, N, M, out_dim = point_features.size() - _, npoint, K, _ = scores.size() - - output = point_features.new_zeros((B, out_dim, npoint, K)) - ext_module.assign_score_withk_forward( - point_features.contiguous(), - center_features.contiguous(), - scores.contiguous(), - knn_idx.contiguous(), - output, - B=B, - N0=N, - N1=npoint, - M=M, - K=K, - O=out_dim, - aggregate=agg[aggregate]) - - ctx.save_for_backward(output, point_features, center_features, scores, - knn_idx) - ctx.agg = agg[aggregate] - - return output - - @staticmethod - def backward(ctx, grad_out): - """ - Args: - grad_out (torch.Tensor): (B, out_dim, npoint, K) - - Returns: - grad_scores (torch.Tensor): (B, npoint, K, M) - grad_point_features (torch.Tensor): (B, N, M, out_dim) - grad_center_features (torch.Tensor): (B, N, M, out_dim) - """ - _, point_features, center_features, scores, knn_idx = ctx.saved_tensors - - agg = ctx.agg - - B, N, M, out_dim = point_features.size() - _, npoint, K, _ = scores.size() - - grad_point_features = point_features.new_zeros(point_features.shape) - grad_center_features = center_features.new_zeros(center_features.shape) - grad_scores = scores.new_zeros(scores.shape) - - ext_module.assign_score_withk_backward( - grad_out.contiguous(), - point_features.contiguous(), - center_features.contiguous(), - scores.contiguous(), - knn_idx.contiguous(), - grad_point_features, - grad_center_features, - grad_scores, - B=B, - N0=N, - N1=npoint, - M=M, - K=K, - O=out_dim, - aggregate=agg) - - return grad_scores, grad_point_features, \ - grad_center_features, None, None - - -assign_score_withk = AssignScoreWithK.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/ball_query.py b/ControlNet/annotator/uniformer/mmcv/ops/ball_query.py deleted file mode 100644 index d0466847c6e5c1239e359a0397568413ebc1504a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/ball_query.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['ball_query_forward']) - - -class BallQuery(Function): - """Find nearby points in spherical space.""" - - @staticmethod - def forward(ctx, min_radius: float, max_radius: float, sample_num: int, - xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor: - """ - Args: - min_radius (float): minimum radius of the balls. - max_radius (float): maximum radius of the balls. - sample_num (int): maximum number of features in the balls. - xyz (Tensor): (B, N, 3) xyz coordinates of the features. - center_xyz (Tensor): (B, npoint, 3) centers of the ball query. - - Returns: - Tensor: (B, npoint, nsample) tensor with the indices of - the features that form the query balls. 
- """ - assert center_xyz.is_contiguous() - assert xyz.is_contiguous() - assert min_radius < max_radius - - B, N, _ = xyz.size() - npoint = center_xyz.size(1) - idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int) - - ext_module.ball_query_forward( - center_xyz, - xyz, - idx, - b=B, - n=N, - m=npoint, - min_radius=min_radius, - max_radius=max_radius, - nsample=sample_num) - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(idx) - return idx - - @staticmethod - def backward(ctx, a=None): - return None, None, None, None - - -ball_query = BallQuery.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/bbox.py b/ControlNet/annotator/uniformer/mmcv/ops/bbox.py deleted file mode 100644 index 0c4d58b6c91f652933974f519acd3403a833e906..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/bbox.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) - - -def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): - """Calculate overlap between two set of bboxes. - - If ``aligned`` is ``False``, then calculate the ious between each bbox - of bboxes1 and bboxes2, otherwise the ious between each aligned pair of - bboxes1 and bboxes2. - - Args: - bboxes1 (Tensor): shape (m, 4) in format or empty. - bboxes2 (Tensor): shape (n, 4) in format or empty. - If aligned is ``True``, then m and n must be equal. - mode (str): "iou" (intersection over union) or iof (intersection over - foreground). - - Returns: - ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) - - Example: - >>> bboxes1 = torch.FloatTensor([ - >>> [0, 0, 10, 10], - >>> [10, 10, 20, 20], - >>> [32, 32, 38, 42], - >>> ]) - >>> bboxes2 = torch.FloatTensor([ - >>> [0, 0, 10, 20], - >>> [0, 10, 10, 19], - >>> [10, 10, 20, 20], - >>> ]) - >>> bbox_overlaps(bboxes1, bboxes2) - tensor([[0.5000, 0.0000, 0.0000], - [0.0000, 0.0000, 1.0000], - [0.0000, 0.0000, 0.0000]]) - - Example: - >>> empty = torch.FloatTensor([]) - >>> nonempty = torch.FloatTensor([ - >>> [0, 0, 10, 9], - >>> ]) - >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) - >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) - >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) - """ - - mode_dict = {'iou': 0, 'iof': 1} - assert mode in mode_dict.keys() - mode_flag = mode_dict[mode] - # Either the boxes are empty or the length of boxes' last dimension is 4 - assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) - assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) - assert offset == 1 or offset == 0 - - rows = bboxes1.size(0) - cols = bboxes2.size(0) - if aligned: - assert rows == cols - - if rows * cols == 0: - return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) - - if aligned: - ious = bboxes1.new_zeros(rows) - else: - ious = bboxes1.new_zeros((rows, cols)) - ext_module.bbox_overlaps( - bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) - return ious diff --git a/ControlNet/annotator/uniformer/mmcv/ops/border_align.py b/ControlNet/annotator/uniformer/mmcv/ops/border_align.py deleted file mode 100644 index ff305be328e9b0a15e1bbb5e6b41beb940f55c81..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/border_align.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-# modified from -# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py - -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['border_align_forward', 'border_align_backward']) - - -class BorderAlignFunction(Function): - - @staticmethod - def symbolic(g, input, boxes, pool_size): - return g.op( - 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) - - @staticmethod - def forward(ctx, input, boxes, pool_size): - ctx.pool_size = pool_size - ctx.input_shape = input.size() - - assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' - assert boxes.size(2) == 4, \ - 'the last dimension of boxes must be (x1, y1, x2, y2)' - assert input.size(1) % 4 == 0, \ - 'the channel for input feature must be divisible by factor 4' - - # [B, C//4, H*W, 4] - output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) - output = input.new_zeros(output_shape) - # `argmax_idx` only used for backward - argmax_idx = input.new_zeros(output_shape).to(torch.int) - - ext_module.border_align_forward( - input, boxes, output, argmax_idx, pool_size=ctx.pool_size) - - ctx.save_for_backward(boxes, argmax_idx) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - boxes, argmax_idx = ctx.saved_tensors - grad_input = grad_output.new_zeros(ctx.input_shape) - # complex head architecture may cause grad_output uncontiguous - grad_output = grad_output.contiguous() - ext_module.border_align_backward( - grad_output, - boxes, - argmax_idx, - grad_input, - pool_size=ctx.pool_size) - return grad_input, None, None - - -border_align = BorderAlignFunction.apply - - -class BorderAlign(nn.Module): - r"""Border align pooling layer. - - Applies border_align over the input feature based on predicted bboxes. - The details were described in the paper - `BorderDet: Border Feature for Dense Object Detection - `_. - - For each border line (e.g. top, left, bottom or right) of each box, - border_align does the following: - 1. uniformly samples `pool_size`+1 positions on this line, involving \ - the start and end points. - 2. the corresponding features on these points are computed by \ - bilinear interpolation. - 3. max pooling over all the `pool_size`+1 positions are used for \ - computing pooled feature. - - Args: - pool_size (int): number of positions sampled over the boxes' borders - (e.g. top, bottom, left, right). - - """ - - def __init__(self, pool_size): - super(BorderAlign, self).__init__() - self.pool_size = pool_size - - def forward(self, input, boxes): - """ - Args: - input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), - [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, - right features respectively. - boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). - - Returns: - Tensor: Pooled features with shape [N,C,H*W,4]. The order is - (top,left,bottom,right) for the last dimension. 
- """ - return border_align(input, boxes, self.pool_size) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(pool_size={self.pool_size})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/box_iou_rotated.py b/ControlNet/annotator/uniformer/mmcv/ops/box_iou_rotated.py deleted file mode 100644 index 2d78015e9c2a9e7a52859b4e18f84a9aa63481a0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/box_iou_rotated.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) - - -def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): - """Return intersection-over-union (Jaccard index) of boxes. - - Both sets of boxes are expected to be in - (x_center, y_center, width, height, angle) format. - - If ``aligned`` is ``False``, then calculate the ious between each bbox - of bboxes1 and bboxes2, otherwise the ious between each aligned pair of - bboxes1 and bboxes2. - - Arguments: - boxes1 (Tensor): rotated bboxes 1. \ - It has shape (N, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. - boxes2 (Tensor): rotated bboxes 2. \ - It has shape (M, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. - mode (str): "iou" (intersection over union) or iof (intersection over - foreground). - - Returns: - ious(Tensor): shape (N, M) if aligned == False else shape (N,) - """ - assert mode in ['iou', 'iof'] - mode_dict = {'iou': 0, 'iof': 1} - mode_flag = mode_dict[mode] - rows = bboxes1.size(0) - cols = bboxes2.size(0) - if aligned: - ious = bboxes1.new_zeros(rows) - else: - ious = bboxes1.new_zeros((rows * cols)) - bboxes1 = bboxes1.contiguous() - bboxes2 = bboxes2.contiguous() - ext_module.box_iou_rotated( - bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) - if not aligned: - ious = ious.view(rows, cols) - return ious diff --git a/ControlNet/annotator/uniformer/mmcv/ops/carafe.py b/ControlNet/annotator/uniformer/mmcv/ops/carafe.py deleted file mode 100644 index 5154cb3abfccfbbe0a1b2daa67018dbf80aaf6d2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/carafe.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Function -from torch.nn.modules.module import Module - -from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', - 'carafe_backward' -]) - - -class CARAFENaiveFunction(Function): - - @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): - return g.op( - 'mmcv::MMCVCARAFENaive', - features, - masks, - kernel_size_i=kernel_size, - group_size_i=group_size, - scale_factor_f=scale_factor) - - @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): - assert scale_factor >= 1 - assert masks.size(1) == kernel_size * kernel_size * group_size - assert masks.size(-1) == features.size(-1) * scale_factor - assert masks.size(-2) == features.size(-2) * scale_factor - assert features.size(1) % group_size == 0 - assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 - ctx.kernel_size = kernel_size - ctx.group_size = group_size - ctx.scale_factor = scale_factor - ctx.feature_size = features.size() - ctx.mask_size = masks.size() - - n, c, h, w = features.size() - output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) - ext_module.carafe_naive_forward( - features, - masks, - output, - kernel_size=kernel_size, - group_size=group_size, - scale_factor=scale_factor) - - if features.requires_grad or masks.requires_grad: - ctx.save_for_backward(features, masks) - return output - - @staticmethod - def backward(ctx, grad_output): - assert grad_output.is_cuda - - features, masks = ctx.saved_tensors - kernel_size = ctx.kernel_size - group_size = ctx.group_size - scale_factor = ctx.scale_factor - - grad_input = torch.zeros_like(features) - grad_masks = torch.zeros_like(masks) - ext_module.carafe_naive_backward( - grad_output.contiguous(), - features, - masks, - grad_input, - grad_masks, - kernel_size=kernel_size, - group_size=group_size, - scale_factor=scale_factor) - - return grad_input, grad_masks, None, None, None - - -carafe_naive = CARAFENaiveFunction.apply - - -class CARAFENaive(Module): - - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFENaive, self).__init__() - - assert isinstance(kernel_size, int) and isinstance( - group_size, int) and isinstance(scale_factor, int) - self.kernel_size = kernel_size - self.group_size = group_size - self.scale_factor = scale_factor - - def forward(self, features, masks): - return carafe_naive(features, masks, self.kernel_size, self.group_size, - self.scale_factor) - - -class CARAFEFunction(Function): - - @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): - return g.op( - 'mmcv::MMCVCARAFE', - features, - masks, - kernel_size_i=kernel_size, - group_size_i=group_size, - scale_factor_f=scale_factor) - - @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): - assert scale_factor >= 1 - assert masks.size(1) == kernel_size * kernel_size * group_size - assert masks.size(-1) == features.size(-1) * scale_factor - assert masks.size(-2) == features.size(-2) * scale_factor - assert features.size(1) % group_size == 0 - assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 - ctx.kernel_size = kernel_size - ctx.group_size = group_size - ctx.scale_factor = scale_factor - ctx.feature_size = features.size() - ctx.mask_size = masks.size() - - n, c, h, w = features.size() - 
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) - routput = features.new_zeros(output.size(), requires_grad=False) - rfeatures = features.new_zeros(features.size(), requires_grad=False) - rmasks = masks.new_zeros(masks.size(), requires_grad=False) - ext_module.carafe_forward( - features, - masks, - rfeatures, - routput, - rmasks, - output, - kernel_size=kernel_size, - group_size=group_size, - scale_factor=scale_factor) - - if features.requires_grad or masks.requires_grad: - ctx.save_for_backward(features, masks, rfeatures) - return output - - @staticmethod - def backward(ctx, grad_output): - assert grad_output.is_cuda - - features, masks, rfeatures = ctx.saved_tensors - kernel_size = ctx.kernel_size - group_size = ctx.group_size - scale_factor = ctx.scale_factor - - rgrad_output = torch.zeros_like(grad_output, requires_grad=False) - rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) - rgrad_input = torch.zeros_like(features, requires_grad=False) - rgrad_masks = torch.zeros_like(masks, requires_grad=False) - grad_input = torch.zeros_like(features, requires_grad=False) - grad_masks = torch.zeros_like(masks, requires_grad=False) - ext_module.carafe_backward( - grad_output.contiguous(), - rfeatures, - masks, - rgrad_output, - rgrad_input_hs, - rgrad_input, - rgrad_masks, - grad_input, - grad_masks, - kernel_size=kernel_size, - group_size=group_size, - scale_factor=scale_factor) - return grad_input, grad_masks, None, None, None - - -carafe = CARAFEFunction.apply - - -class CARAFE(Module): - """ CARAFE: Content-Aware ReAssembly of FEatures - - Please refer to https://arxiv.org/abs/1905.02188 for more details. - - Args: - kernel_size (int): reassemble kernel size - group_size (int): reassemble group size - scale_factor (int): upsample ratio - - Returns: - upsampled feature map - """ - - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFE, self).__init__() - - assert isinstance(kernel_size, int) and isinstance( - group_size, int) and isinstance(scale_factor, int) - self.kernel_size = kernel_size - self.group_size = group_size - self.scale_factor = scale_factor - - def forward(self, features, masks): - return carafe(features, masks, self.kernel_size, self.group_size, - self.scale_factor) - - -@UPSAMPLE_LAYERS.register_module(name='carafe') -class CARAFEPack(nn.Module): - """A unified package of CARAFE upsampler that contains: 1) channel - compressor 2) content encoder 3) CARAFE op. - - Official implementation of ICCV 2019 paper - CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. 
- - Args: - channels (int): input feature channels - scale_factor (int): upsample ratio - up_kernel (int): kernel size of CARAFE op - up_group (int): group size of CARAFE op - encoder_kernel (int): kernel size of content encoder - encoder_dilation (int): dilation of content encoder - compressed_channels (int): output channels of channels compressor - - Returns: - upsampled feature map - """ - - def __init__(self, - channels, - scale_factor, - up_kernel=5, - up_group=1, - encoder_kernel=3, - encoder_dilation=1, - compressed_channels=64): - super(CARAFEPack, self).__init__() - self.channels = channels - self.scale_factor = scale_factor - self.up_kernel = up_kernel - self.up_group = up_group - self.encoder_kernel = encoder_kernel - self.encoder_dilation = encoder_dilation - self.compressed_channels = compressed_channels - self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, - 1) - self.content_encoder = nn.Conv2d( - self.compressed_channels, - self.up_kernel * self.up_kernel * self.up_group * - self.scale_factor * self.scale_factor, - self.encoder_kernel, - padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), - dilation=self.encoder_dilation, - groups=1) - self.init_weights() - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - xavier_init(m, distribution='uniform') - normal_init(self.content_encoder, std=0.001) - - def kernel_normalizer(self, mask): - mask = F.pixel_shuffle(mask, self.scale_factor) - n, mask_c, h, w = mask.size() - # use float division explicitly, - # to void inconsistency while exporting to onnx - mask_channel = int(mask_c / float(self.up_kernel**2)) - mask = mask.view(n, mask_channel, -1, h, w) - - mask = F.softmax(mask, dim=2, dtype=mask.dtype) - mask = mask.view(n, mask_c, h, w).contiguous() - - return mask - - def feature_reassemble(self, x, mask): - x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) - return x - - def forward(self, x): - compressed_x = self.channel_compressor(x) - mask = self.content_encoder(compressed_x) - mask = self.kernel_normalizer(mask) - - x = self.feature_reassemble(x, mask) - return x diff --git a/ControlNet/annotator/uniformer/mmcv/ops/cc_attention.py b/ControlNet/annotator/uniformer/mmcv/ops/cc_attention.py deleted file mode 100644 index 9207aa95e6730bd9b3362dee612059a5f0ce1c5e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/cc_attention.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F - -from annotator.uniformer.mmcv.cnn import PLUGIN_LAYERS, Scale - - -def NEG_INF_DIAG(n, device): - """Returns a diagonal matrix of size [n, n]. - - The diagonal are all "-inf". This is for avoiding calculating the - overlapped element in the Criss-Cross twice. - """ - return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) - - -@PLUGIN_LAYERS.register_module() -class CrissCrossAttention(nn.Module): - """Criss-Cross Attention Module. - - .. note:: - Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch - to a pure PyTorch and equivalent implementation. For more - details, please refer to https://github.com/open-mmlab/mmcv/pull/1201. 
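Note (not part of the diff above): the removed `CARAFEPack` module bundles the channel compressor, content encoder and CARAFE op into a single learnable upsampler. A minimal sketch, assuming a CUDA build of the `_ext` extension (the CARAFE kernels are GPU-only).

```python
# Illustrative only: the CARAFE kernels are CUDA-only, so a GPU build of "_ext" is assumed.
import torch
from annotator.uniformer.mmcv.ops import CARAFEPack

upsampler = CARAFEPack(channels=64, scale_factor=2).cuda()
feat = torch.rand(1, 64, 32, 32).cuda()
out = upsampler(feat)        # content-aware 2x upsampling
print(out.shape)             # torch.Size([1, 64, 64, 64])
```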
- - Speed comparison for one forward pass - - - Input size: [2,512,97,97] - - Device: 1 NVIDIA GeForce RTX 2080 Ti - - +-----------------------+---------------+------------+---------------+ - | |PyTorch version|CUDA version|Relative speed | - +=======================+===============+============+===============+ - |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | - +-----------------------+---------------+------------+---------------+ - |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | - +-----------------------+---------------+------------+---------------+ - - Args: - in_channels (int): Channels of the input feature map. - """ - - def __init__(self, in_channels): - super().__init__() - self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) - self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) - self.value_conv = nn.Conv2d(in_channels, in_channels, 1) - self.gamma = Scale(0.) - self.in_channels = in_channels - - def forward(self, x): - """forward function of Criss-Cross Attention. - - Args: - x (Tensor): Input feature. \ - shape (batch_size, in_channels, height, width) - Returns: - Tensor: Output of the layer, with shape of \ - (batch_size, in_channels, height, width) - """ - B, C, H, W = x.size() - query = self.query_conv(x) - key = self.key_conv(x) - value = self.value_conv(x) - energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( - H, query.device) - energy_H = energy_H.transpose(1, 2) - energy_W = torch.einsum('bchw,bchj->bhwj', query, key) - attn = F.softmax( - torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] - out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) - out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) - - out = self.gamma(out) + x - out = out.contiguous() - - return out - - def __repr__(self): - s = self.__class__.__name__ - s += f'(in_channels={self.in_channels})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/contour_expand.py b/ControlNet/annotator/uniformer/mmcv/ops/contour_expand.py deleted file mode 100644 index ea1111e1768b5f27e118bf7dbc0d9c70a7afd6d7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/contour_expand.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['contour_expand']) - - -def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, - kernel_num): - """Expand kernel contours so that foreground pixels are assigned into - instances. - - Arguments: - kernel_mask (np.array or Tensor): The instance kernel mask with - size hxw. - internal_kernel_label (np.array or Tensor): The instance internal - kernel label with size hxw. - min_kernel_area (int): The minimum kernel area. - kernel_num (int): The instance kernel number. - - Returns: - label (list): The instance index map with size hxw. 
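Note (not part of the diff above): a minimal sketch of the removed `CrissCrossAttention` layer. Its forward pass has been pure PyTorch since mmcv v1.3.13, so it runs on CPU, although importing the ops package still loads the compiled `_ext` extension; that build is assumed here.

```python
# Illustrative only: the forward pass itself is pure PyTorch (mmcv >= 1.3.13).
import torch
from annotator.uniformer.mmcv.ops import CrissCrossAttention

attn = CrissCrossAttention(in_channels=64)
x = torch.rand(2, 64, 32, 32)
y = attn(x)                  # attention over each pixel's row and column
print(y.shape)               # torch.Size([2, 64, 32, 32])
```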
- """ - assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) - assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) - assert isinstance(min_kernel_area, int) - assert isinstance(kernel_num, int) - - if isinstance(kernel_mask, np.ndarray): - kernel_mask = torch.from_numpy(kernel_mask) - if isinstance(internal_kernel_label, np.ndarray): - internal_kernel_label = torch.from_numpy(internal_kernel_label) - - if torch.__version__ == 'parrots': - if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: - label = [] - else: - label = ext_module.contour_expand( - kernel_mask, - internal_kernel_label, - min_kernel_area=min_kernel_area, - kernel_num=kernel_num) - label = label.tolist() - else: - label = ext_module.contour_expand(kernel_mask, internal_kernel_label, - min_kernel_area, kernel_num) - return label diff --git a/ControlNet/annotator/uniformer/mmcv/ops/corner_pool.py b/ControlNet/annotator/uniformer/mmcv/ops/corner_pool.py deleted file mode 100644 index a33d798b43d405e4c86bee4cd6389be21ca9c637..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/corner_pool.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', - 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', - 'right_pool_forward', 'right_pool_backward' -]) - -_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} - - -class TopPoolFunction(Function): - - @staticmethod - def symbolic(g, input): - output = g.op( - 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) - return output - - @staticmethod - def forward(ctx, input): - output = ext_module.top_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.top_pool_backward(input, grad_output) - return output - - -class BottomPoolFunction(Function): - - @staticmethod - def symbolic(g, input): - output = g.op( - 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) - return output - - @staticmethod - def forward(ctx, input): - output = ext_module.bottom_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.bottom_pool_backward(input, grad_output) - return output - - -class LeftPoolFunction(Function): - - @staticmethod - def symbolic(g, input): - output = g.op( - 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) - return output - - @staticmethod - def forward(ctx, input): - output = ext_module.left_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.left_pool_backward(input, grad_output) - return output - - -class RightPoolFunction(Function): - - @staticmethod - def symbolic(g, input): - output = g.op( - 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) - return output - - @staticmethod - def forward(ctx, input): - output = ext_module.right_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.right_pool_backward(input, grad_output) - return output - - 
-class CornerPool(nn.Module): - """Corner Pooling. - - Corner Pooling is a new type of pooling layer that helps a - convolutional network better localize corners of bounding boxes. - - Please refer to https://arxiv.org/abs/1808.01244 for more details. - Code is modified from https://github.com/princeton-vl/CornerNet-Lite. - - Args: - mode(str): Pooling orientation for the pooling layer - - - 'bottom': Bottom Pooling - - 'left': Left Pooling - - 'right': Right Pooling - - 'top': Top Pooling - - Returns: - Feature map after pooling. - """ - - pool_functions = { - 'bottom': BottomPoolFunction, - 'left': LeftPoolFunction, - 'right': RightPoolFunction, - 'top': TopPoolFunction, - } - - cummax_dim_flip = { - 'bottom': (2, False), - 'left': (3, True), - 'right': (3, False), - 'top': (2, True), - } - - def __init__(self, mode): - super(CornerPool, self).__init__() - assert mode in self.pool_functions - self.mode = mode - self.corner_pool = self.pool_functions[mode] - - def forward(self, x): - if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': - if torch.onnx.is_in_onnx_export(): - assert torch.__version__ >= '1.7.0', \ - 'When `cummax` serves as an intermediate component whose '\ - 'outputs is used as inputs for another modules, it\'s '\ - 'expected that pytorch version must be >= 1.7.0, '\ - 'otherwise Error appears like: `RuntimeError: tuple '\ - 'appears in op that does not forward tuples, unsupported '\ - 'kind: prim::PythonOp`.' - - dim, flip = self.cummax_dim_flip[self.mode] - if flip: - x = x.flip(dim) - pool_tensor, _ = torch.cummax(x, dim=dim) - if flip: - pool_tensor = pool_tensor.flip(dim) - return pool_tensor - else: - return self.corner_pool.apply(x) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/correlation.py b/ControlNet/annotator/uniformer/mmcv/ops/correlation.py deleted file mode 100644 index 3d0b79c301b29915dfaf4d2b1846c59be73127d3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/correlation.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
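Note (not part of the diff above): a minimal sketch of the removed `CornerPool` layer. With PyTorch >= 1.5 the forward pass falls back to `torch.cummax`, so no GPU is needed, but importing the ops package still assumes the compiled `_ext` extension is present.

```python
# Illustrative only: with PyTorch >= 1.5 this runs via torch.cummax, CPU included.
import torch
from annotator.uniformer.mmcv.ops import CornerPool

pool = CornerPool('top')
x = torch.rand(2, 16, 8, 8)
out = pool(x)                # each location takes the max of itself and everything below it
print(out.shape)             # torch.Size([2, 16, 8, 8])
```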
-import torch -from torch import Tensor, nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['correlation_forward', 'correlation_backward']) - - -class CorrelationFunction(Function): - - @staticmethod - def forward(ctx, - input1, - input2, - kernel_size=1, - max_displacement=1, - stride=1, - padding=1, - dilation=1, - dilation_patch=1): - - ctx.save_for_backward(input1, input2) - - kH, kW = ctx.kernel_size = _pair(kernel_size) - patch_size = max_displacement * 2 + 1 - ctx.patch_size = patch_size - dH, dW = ctx.stride = _pair(stride) - padH, padW = ctx.padding = _pair(padding) - dilationH, dilationW = ctx.dilation = _pair(dilation) - dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( - dilation_patch) - - output_size = CorrelationFunction._output_size(ctx, input1) - - output = input1.new_zeros(output_size) - - ext_module.correlation_forward( - input1, - input2, - output, - kH=kH, - kW=kW, - patchH=patch_size, - patchW=patch_size, - padH=padH, - padW=padW, - dilationH=dilationH, - dilationW=dilationW, - dilation_patchH=dilation_patchH, - dilation_patchW=dilation_patchW, - dH=dH, - dW=dW) - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input1, input2 = ctx.saved_tensors - - kH, kW = ctx.kernel_size - patch_size = ctx.patch_size - padH, padW = ctx.padding - dilationH, dilationW = ctx.dilation - dilation_patchH, dilation_patchW = ctx.dilation_patch - dH, dW = ctx.stride - grad_input1 = torch.zeros_like(input1) - grad_input2 = torch.zeros_like(input2) - - ext_module.correlation_backward( - grad_output, - input1, - input2, - grad_input1, - grad_input2, - kH=kH, - kW=kW, - patchH=patch_size, - patchW=patch_size, - padH=padH, - padW=padW, - dilationH=dilationH, - dilationW=dilationW, - dilation_patchH=dilation_patchH, - dilation_patchW=dilation_patchW, - dH=dH, - dW=dW) - return grad_input1, grad_input2, None, None, None, None, None, None - - @staticmethod - def _output_size(ctx, input1): - iH, iW = input1.size(2), input1.size(3) - batch_size = input1.size(0) - kH, kW = ctx.kernel_size - patch_size = ctx.patch_size - dH, dW = ctx.stride - padH, padW = ctx.padding - dilationH, dilationW = ctx.dilation - dilatedKH = (kH - 1) * dilationH + 1 - dilatedKW = (kW - 1) * dilationW + 1 - - oH = int((iH + 2 * padH - dilatedKH) / dH + 1) - oW = int((iW + 2 * padW - dilatedKW) / dW + 1) - - output_size = (batch_size, patch_size, patch_size, oH, oW) - return output_size - - -class Correlation(nn.Module): - r"""Correlation operator - - This correlation operator works for optical flow correlation computation. - - There are two batched tensors with shape :math:`(N, C, H, W)`, - and the correlation output's shape is :math:`(N, max\_displacement \times - 2 + 1, max\_displacement * 2 + 1, H_{out}, W_{out})` - - where - - .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - - dilation \times (kernel\_size - 1) - 1} - {stride} + 1\right\rfloor - - .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation - \times (kernel\_size - 1) - 1} - {stride} + 1\right\rfloor - - the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding - window convolution between input1 and shifted input2, - - .. 
math:: - Corr(N_i, dx, dy) = - \sum_{c=0}^{C-1} - input1(N_i, c) \star - \mathcal{S}(input2(N_i, c), dy, dx) - - where :math:`\star` is the valid 2d sliding window convolution operator, - and :math:`\mathcal{S}` means shifting the input features (auto-complete - zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in - [-max\_displacement \times dilation\_patch, max\_displacement \times - dilation\_patch]`. - - Args: - kernel_size (int): The size of sliding window i.e. local neighborhood - representing the center points and involved in correlation - computation. Defaults to 1. - max_displacement (int): The radius for computing correlation volume, - but the actual working space can be dilated by dilation_patch. - Defaults to 1. - stride (int): The stride of the sliding blocks in the input spatial - dimensions. Defaults to 1. - padding (int): Zero padding added to all four sides of the input1. - Defaults to 0. - dilation (int): The spacing of local neighborhood that will involved - in correlation. Defaults to 1. - dilation_patch (int): The spacing between position need to compute - correlation. Defaults to 1. - """ - - def __init__(self, - kernel_size: int = 1, - max_displacement: int = 1, - stride: int = 1, - padding: int = 0, - dilation: int = 1, - dilation_patch: int = 1) -> None: - super().__init__() - self.kernel_size = kernel_size - self.max_displacement = max_displacement - self.stride = stride - self.padding = padding - self.dilation = dilation - self.dilation_patch = dilation_patch - - def forward(self, input1: Tensor, input2: Tensor) -> Tensor: - return CorrelationFunction.apply(input1, input2, self.kernel_size, - self.max_displacement, self.stride, - self.padding, self.dilation, - self.dilation_patch) - - def __repr__(self) -> str: - s = self.__class__.__name__ - s += f'(kernel_size={self.kernel_size}, ' - s += f'max_displacement={self.max_displacement}, ' - s += f'stride={self.stride}, ' - s += f'padding={self.padding}, ' - s += f'dilation={self.dilation}, ' - s += f'dilation_patch={self.dilation_patch})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/deform_conv.py b/ControlNet/annotator/uniformer/mmcv/ops/deform_conv.py deleted file mode 100644 index a3f8c75ee774823eea334e3b3732af6a18f55038..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/deform_conv.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
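Note (not part of the diff above): a minimal sketch of the removed `Correlation` module, which builds an optical-flow style cost volume between two feature maps. It assumes a CUDA build of the `_ext` extension; the output shape follows the formula in the docstring above.

```python
# Illustrative only: the correlation kernels are CUDA-only, so a GPU build of "_ext" is assumed.
import torch
from annotator.uniformer.mmcv.ops import Correlation

corr = Correlation(max_displacement=3)        # 7x7 search window per position
feat1 = torch.rand(1, 32, 24, 24).cuda()
feat2 = torch.rand(1, 32, 24, 24).cuda()
cost_volume = corr(feat1, feat2)
print(cost_volume.shape)                      # torch.Size([1, 7, 7, 24, 24])
```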
-from typing import Tuple, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair, _single - -from annotator.uniformer.mmcv.utils import deprecated_api_warning -from ..cnn import CONV_LAYERS -from ..utils import ext_loader, print_log - -ext_module = ext_loader.load_ext('_ext', [ - 'deform_conv_forward', 'deform_conv_backward_input', - 'deform_conv_backward_parameters' -]) - - -class DeformConv2dFunction(Function): - - @staticmethod - def symbolic(g, - input, - offset, - weight, - stride, - padding, - dilation, - groups, - deform_groups, - bias=False, - im2col_step=32): - return g.op( - 'mmcv::MMCVDeformConv2d', - input, - offset, - weight, - stride_i=stride, - padding_i=padding, - dilation_i=dilation, - groups_i=groups, - deform_groups_i=deform_groups, - bias_i=bias, - im2col_step_i=im2col_step) - - @staticmethod - def forward(ctx, - input, - offset, - weight, - stride=1, - padding=0, - dilation=1, - groups=1, - deform_groups=1, - bias=False, - im2col_step=32): - if input is not None and input.dim() != 4: - raise ValueError( - f'Expected 4D tensor as input, got {input.dim()}D tensor \ - instead.') - assert bias is False, 'Only support bias is False.' - ctx.stride = _pair(stride) - ctx.padding = _pair(padding) - ctx.dilation = _pair(dilation) - ctx.groups = groups - ctx.deform_groups = deform_groups - ctx.im2col_step = im2col_step - - # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; - # amp won't cast the type of model (float32), but "offset" is cast - # to float16 by nn.Conv2d automatically, leading to the type - # mismatch with input (when it is float32) or weight. - # The flag for whether to use fp16 or amp is the type of "offset", - # we cast weight and input to temporarily support fp16 and amp - # whatever the pytorch version is. 
- input = input.type_as(offset) - weight = weight.type_as(input) - ctx.save_for_backward(input, offset, weight) - - output = input.new_empty( - DeformConv2dFunction._output_size(ctx, input, weight)) - - ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones - - cur_im2col_step = min(ctx.im2col_step, input.size(0)) - assert (input.size(0) % - cur_im2col_step) == 0, 'im2col step must divide batchsize' - ext_module.deform_conv_forward( - input, - weight, - offset, - output, - ctx.bufs_[0], - ctx.bufs_[1], - kW=weight.size(3), - kH=weight.size(2), - dW=ctx.stride[1], - dH=ctx.stride[0], - padW=ctx.padding[1], - padH=ctx.padding[0], - dilationW=ctx.dilation[1], - dilationH=ctx.dilation[0], - group=ctx.groups, - deformable_group=ctx.deform_groups, - im2col_step=cur_im2col_step) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, offset, weight = ctx.saved_tensors - - grad_input = grad_offset = grad_weight = None - - cur_im2col_step = min(ctx.im2col_step, input.size(0)) - assert (input.size(0) % cur_im2col_step - ) == 0, 'batch size must be divisible by im2col_step' - - grad_output = grad_output.contiguous() - if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: - grad_input = torch.zeros_like(input) - grad_offset = torch.zeros_like(offset) - ext_module.deform_conv_backward_input( - input, - offset, - grad_output, - grad_input, - grad_offset, - weight, - ctx.bufs_[0], - kW=weight.size(3), - kH=weight.size(2), - dW=ctx.stride[1], - dH=ctx.stride[0], - padW=ctx.padding[1], - padH=ctx.padding[0], - dilationW=ctx.dilation[1], - dilationH=ctx.dilation[0], - group=ctx.groups, - deformable_group=ctx.deform_groups, - im2col_step=cur_im2col_step) - - if ctx.needs_input_grad[2]: - grad_weight = torch.zeros_like(weight) - ext_module.deform_conv_backward_parameters( - input, - offset, - grad_output, - grad_weight, - ctx.bufs_[0], - ctx.bufs_[1], - kW=weight.size(3), - kH=weight.size(2), - dW=ctx.stride[1], - dH=ctx.stride[0], - padW=ctx.padding[1], - padH=ctx.padding[0], - dilationW=ctx.dilation[1], - dilationH=ctx.dilation[0], - group=ctx.groups, - deformable_group=ctx.deform_groups, - scale=1, - im2col_step=cur_im2col_step) - - return grad_input, grad_offset, grad_weight, \ - None, None, None, None, None, None, None - - @staticmethod - def _output_size(ctx, input, weight): - channels = weight.size(0) - output_size = (input.size(0), channels) - for d in range(input.dim() - 2): - in_size = input.size(d + 2) - pad = ctx.padding[d] - kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 - stride_ = ctx.stride[d] - output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) - if not all(map(lambda s: s > 0, output_size)): - raise ValueError( - 'convolution input is too small (output would be ' + - 'x'.join(map(str, output_size)) + ')') - return output_size - - -deform_conv2d = DeformConv2dFunction.apply - - -class DeformConv2d(nn.Module): - r"""Deformable 2D convolution. - - Applies a deformable 2D convolution over an input signal composed of - several input planes. DeformConv2d was described in the paper - `Deformable Convolutional Networks - `_ - - Note: - The argument ``im2col_step`` was added in version 1.3.17, which means - number of samples processed by the ``im2col_cuda_kernel`` per call. - It enables users to define ``batch_size`` and ``im2col_step`` more - flexibly and solved `issue mmcv#1440 - `_. - - Args: - in_channels (int): Number of channels in the input image. 
- out_channels (int): Number of channels produced by the convolution. - kernel_size(int, tuple): Size of the convolving kernel. - stride(int, tuple): Stride of the convolution. Default: 1. - padding (int or tuple): Zero-padding added to both sides of the input. - Default: 0. - dilation (int or tuple): Spacing between kernel elements. Default: 1. - groups (int): Number of blocked connections from input. - channels to output channels. Default: 1. - deform_groups (int): Number of deformable group partitions. - bias (bool): If True, adds a learnable bias to the output. - Default: False. - im2col_step (int): Number of samples processed by im2col_cuda_kernel - per call. It will work when ``batch_size`` > ``im2col_step``, but - ``batch_size`` must be divisible by ``im2col_step``. Default: 32. - `New in version 1.3.17.` - """ - - @deprecated_api_warning({'deformable_groups': 'deform_groups'}, - cls_name='DeformConv2d') - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, ...]], - stride: Union[int, Tuple[int, ...]] = 1, - padding: Union[int, Tuple[int, ...]] = 0, - dilation: Union[int, Tuple[int, ...]] = 1, - groups: int = 1, - deform_groups: int = 1, - bias: bool = False, - im2col_step: int = 32) -> None: - super(DeformConv2d, self).__init__() - - assert not bias, \ - f'bias={bias} is not supported in DeformConv2d.' - assert in_channels % groups == 0, \ - f'in_channels {in_channels} cannot be divisible by groups {groups}' - assert out_channels % groups == 0, \ - f'out_channels {out_channels} cannot be divisible by groups \ - {groups}' - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _pair(kernel_size) - self.stride = _pair(stride) - self.padding = _pair(padding) - self.dilation = _pair(dilation) - self.groups = groups - self.deform_groups = deform_groups - self.im2col_step = im2col_step - # enable compatibility with nn.Conv2d - self.transposed = False - self.output_padding = _single(0) - - # only weight, no bias - self.weight = nn.Parameter( - torch.Tensor(out_channels, in_channels // self.groups, - *self.kernel_size)) - - self.reset_parameters() - - def reset_parameters(self): - # switch the initialization of `self.weight` to the standard kaiming - # method described in `Delving deep into rectifiers: Surpassing - # human-level performance on ImageNet classification` - He, K. et al. - # (2015), using a uniform distribution - nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') - - def forward(self, x: Tensor, offset: Tensor) -> Tensor: - """Deformable Convolutional forward function. - - Args: - x (Tensor): Input feature, shape (B, C_in, H_in, W_in) - offset (Tensor): Offset for deformable convolution, shape - (B, deform_groups*kernel_size[0]*kernel_size[1]*2, - H_out, W_out), H_out, W_out are equal to the output's. - - An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. - The spatial arrangement is like: - - .. code:: text - - (x0, y0) (x1, y1) (x2, y2) - (x3, y3) (x4, y4) (x5, y5) - (x6, y6) (x7, y7) (x8, y8) - - Returns: - Tensor: Output of the layer. 
- """ - # To fix an assert error in deform_conv_cuda.cpp:128 - # input image is smaller than kernel - input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < - self.kernel_size[1]) - if input_pad: - pad_h = max(self.kernel_size[0] - x.size(2), 0) - pad_w = max(self.kernel_size[1] - x.size(3), 0) - x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() - offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) - offset = offset.contiguous() - out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, - self.dilation, self.groups, self.deform_groups, - False, self.im2col_step) - if input_pad: - out = out[:, :, :out.size(2) - pad_h, :out.size(3) - - pad_w].contiguous() - return out - - def __repr__(self): - s = self.__class__.__name__ - s += f'(in_channels={self.in_channels},\n' - s += f'out_channels={self.out_channels},\n' - s += f'kernel_size={self.kernel_size},\n' - s += f'stride={self.stride},\n' - s += f'padding={self.padding},\n' - s += f'dilation={self.dilation},\n' - s += f'groups={self.groups},\n' - s += f'deform_groups={self.deform_groups},\n' - # bias is not supported in DeformConv2d. - s += 'bias=False)' - return s - - -@CONV_LAYERS.register_module('DCN') -class DeformConv2dPack(DeformConv2d): - """A Deformable Conv Encapsulation that acts as normal Conv layers. - - The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. - The spatial arrangement is like: - - .. code:: text - - (x0, y0) (x1, y1) (x2, y2) - (x3, y3) (x4, y4) (x5, y5) - (x6, y6) (x7, y7) (x8, y8) - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int or tuple[int]): Same as nn.Conv2d. - stride (int or tuple[int]): Same as nn.Conv2d. - padding (int or tuple[int]): Same as nn.Conv2d. - dilation (int or tuple[int]): Same as nn.Conv2d. - groups (int): Same as nn.Conv2d. - bias (bool or str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if norm_cfg is None, otherwise - False. - """ - - _version = 2 - - def __init__(self, *args, **kwargs): - super(DeformConv2dPack, self).__init__(*args, **kwargs) - self.conv_offset = nn.Conv2d( - self.in_channels, - self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1], - kernel_size=self.kernel_size, - stride=_pair(self.stride), - padding=_pair(self.padding), - dilation=_pair(self.dilation), - bias=True) - self.init_offset() - - def init_offset(self): - self.conv_offset.weight.data.zero_() - self.conv_offset.bias.data.zero_() - - def forward(self, x): - offset = self.conv_offset(x) - return deform_conv2d(x, offset, self.weight, self.stride, self.padding, - self.dilation, self.groups, self.deform_groups, - False, self.im2col_step) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) - - if version is None or version < 2: - # the key is different in early versions - # In version < 2, DeformConvPack loads previous benchmark models. 
- if (prefix + 'conv_offset.weight' not in state_dict - and prefix[:-1] + '_offset.weight' in state_dict): - state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( - prefix[:-1] + '_offset.weight') - if (prefix + 'conv_offset.bias' not in state_dict - and prefix[:-1] + '_offset.bias' in state_dict): - state_dict[prefix + - 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + - '_offset.bias') - - if version is not None and version > 1: - print_log( - f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to ' - 'version 2.', - logger='root') - - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, - error_msgs) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/deform_roi_pool.py b/ControlNet/annotator/uniformer/mmcv/ops/deform_roi_pool.py deleted file mode 100644 index cc245ba91fee252226ba22e76bb94a35db9a629b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/deform_roi_pool.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from torch import nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward']) - - -class DeformRoIPoolFunction(Function): - - @staticmethod - def symbolic(g, input, rois, offset, output_size, spatial_scale, - sampling_ratio, gamma): - return g.op( - 'mmcv::MMCVDeformRoIPool', - input, - rois, - offset, - pooled_height_i=output_size[0], - pooled_width_i=output_size[1], - spatial_scale_f=spatial_scale, - sampling_ratio_f=sampling_ratio, - gamma_f=gamma) - - @staticmethod - def forward(ctx, - input, - rois, - offset, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - gamma=0.1): - if offset is None: - offset = input.new_zeros(0) - ctx.output_size = _pair(output_size) - ctx.spatial_scale = float(spatial_scale) - ctx.sampling_ratio = int(sampling_ratio) - ctx.gamma = float(gamma) - - assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' 
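Note (not part of the diff above): a minimal sketch of the removed `DeformConv2dPack`, a drop-in replacement for `nn.Conv2d` that also learns its own sampling offsets through the internal `conv_offset` layer. A CUDA build of the `_ext` extension is assumed (the deformable-conv kernels are GPU-only).

```python
# Illustrative only: the deformable-conv kernels are CUDA-only, so a GPU build of "_ext" is assumed.
import torch
from annotator.uniformer.mmcv.ops import DeformConv2dPack

# Behaves like nn.Conv2d(16, 32, 3, padding=1), with learned per-location offsets.
dconv = DeformConv2dPack(16, 32, kernel_size=3, padding=1).cuda()
x = torch.rand(2, 16, 28, 28).cuda()
y = dconv(x)
print(y.shape)               # torch.Size([2, 32, 28, 28])
```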
- - output_shape = (rois.size(0), input.size(1), ctx.output_size[0], - ctx.output_size[1]) - output = input.new_zeros(output_shape) - - ext_module.deform_roi_pool_forward( - input, - rois, - offset, - output, - pooled_height=ctx.output_size[0], - pooled_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - gamma=ctx.gamma) - - ctx.save_for_backward(input, rois, offset) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, rois, offset = ctx.saved_tensors - grad_input = grad_output.new_zeros(input.shape) - grad_offset = grad_output.new_zeros(offset.shape) - - ext_module.deform_roi_pool_backward( - grad_output, - input, - rois, - offset, - grad_input, - grad_offset, - pooled_height=ctx.output_size[0], - pooled_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - gamma=ctx.gamma) - if grad_offset.numel() == 0: - grad_offset = None - return grad_input, None, grad_offset, None, None, None, None - - -deform_roi_pool = DeformRoIPoolFunction.apply - - -class DeformRoIPool(nn.Module): - - def __init__(self, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - gamma=0.1): - super(DeformRoIPool, self).__init__() - self.output_size = _pair(output_size) - self.spatial_scale = float(spatial_scale) - self.sampling_ratio = int(sampling_ratio) - self.gamma = float(gamma) - - def forward(self, input, rois, offset=None): - return deform_roi_pool(input, rois, offset, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.gamma) - - -class DeformRoIPoolPack(DeformRoIPool): - - def __init__(self, - output_size, - output_channels, - deform_fc_channels=1024, - spatial_scale=1.0, - sampling_ratio=0, - gamma=0.1): - super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale, - sampling_ratio, gamma) - - self.output_channels = output_channels - self.deform_fc_channels = deform_fc_channels - - self.offset_fc = nn.Sequential( - nn.Linear( - self.output_size[0] * self.output_size[1] * - self.output_channels, self.deform_fc_channels), - nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_channels, self.deform_fc_channels), - nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_channels, - self.output_size[0] * self.output_size[1] * 2)) - self.offset_fc[-1].weight.data.zero_() - self.offset_fc[-1].bias.data.zero_() - - def forward(self, input, rois): - assert input.size(1) == self.output_channels - x = deform_roi_pool(input, rois, None, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.gamma) - rois_num = rois.size(0) - offset = self.offset_fc(x.view(rois_num, -1)) - offset = offset.view(rois_num, 2, self.output_size[0], - self.output_size[1]) - return deform_roi_pool(input, rois, offset, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.gamma) - - -class ModulatedDeformRoIPoolPack(DeformRoIPool): - - def __init__(self, - output_size, - output_channels, - deform_fc_channels=1024, - spatial_scale=1.0, - sampling_ratio=0, - gamma=0.1): - super(ModulatedDeformRoIPoolPack, - self).__init__(output_size, spatial_scale, sampling_ratio, gamma) - - self.output_channels = output_channels - self.deform_fc_channels = deform_fc_channels - - self.offset_fc = nn.Sequential( - nn.Linear( - self.output_size[0] * self.output_size[1] * - self.output_channels, self.deform_fc_channels), - nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_channels, self.deform_fc_channels), - nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_channels, - 
self.output_size[0] * self.output_size[1] * 2)) - self.offset_fc[-1].weight.data.zero_() - self.offset_fc[-1].bias.data.zero_() - - self.mask_fc = nn.Sequential( - nn.Linear( - self.output_size[0] * self.output_size[1] * - self.output_channels, self.deform_fc_channels), - nn.ReLU(inplace=True), - nn.Linear(self.deform_fc_channels, - self.output_size[0] * self.output_size[1] * 1), - nn.Sigmoid()) - self.mask_fc[2].weight.data.zero_() - self.mask_fc[2].bias.data.zero_() - - def forward(self, input, rois): - assert input.size(1) == self.output_channels - x = deform_roi_pool(input, rois, None, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.gamma) - rois_num = rois.size(0) - offset = self.offset_fc(x.view(rois_num, -1)) - offset = offset.view(rois_num, 2, self.output_size[0], - self.output_size[1]) - mask = self.mask_fc(x.view(rois_num, -1)) - mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1]) - d = deform_roi_pool(input, rois, offset, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.gamma) - return d * mask diff --git a/ControlNet/annotator/uniformer/mmcv/ops/deprecated_wrappers.py b/ControlNet/annotator/uniformer/mmcv/ops/deprecated_wrappers.py deleted file mode 100644 index a2e593df9ee57637038683d7a1efaa347b2b69e7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/deprecated_wrappers.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# This file is for backward compatibility. -# Module wrappers for empty tensor have been moved to mmcv.cnn.bricks. -import warnings - -from ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d - - -class Conv2d_deprecated(Conv2d): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - warnings.warn( - 'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in' - ' the future. Please import them from "mmcv.cnn" instead') - - -class ConvTranspose2d_deprecated(ConvTranspose2d): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - warnings.warn( - 'Importing ConvTranspose2d wrapper from "mmcv.ops" will be ' - 'deprecated in the future. Please import them from "mmcv.cnn" ' - 'instead') - - -class MaxPool2d_deprecated(MaxPool2d): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - warnings.warn( - 'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in' - ' the future. Please import them from "mmcv.cnn" instead') - - -class Linear_deprecated(Linear): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - warnings.warn( - 'Importing Linear wrapper from "mmcv.ops" will be deprecated in' - ' the future. Please import them from "mmcv.cnn" instead') diff --git a/ControlNet/annotator/uniformer/mmcv/ops/focal_loss.py b/ControlNet/annotator/uniformer/mmcv/ops/focal_loss.py deleted file mode 100644 index 763bc93bd2575c49ca8ccf20996bbd92d1e0d1a4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/focal_loss.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
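Note (not part of the diff above): a minimal sketch of the removed `DeformRoIPoolPack`. RoIs follow the `(batch_idx, x1, y1, x2, y2)` layout asserted in the function code above, and a CUDA build of the `_ext` extension is assumed.

```python
# Illustrative only: CUDA-only kernels; rois follow the (batch_idx, x1, y1, x2, y2) layout.
import torch
from annotator.uniformer.mmcv.ops import DeformRoIPoolPack

pool = DeformRoIPoolPack(output_size=7, output_channels=16, spatial_scale=0.25).cuda()
feat = torch.rand(1, 16, 32, 32).cuda()                      # e.g. a stride-4 feature map
rois = torch.tensor([[0.0, 8.0, 8.0, 40.0, 40.0]]).cuda()    # one box in image coordinates
out = pool(feat, rois)
print(out.shape)             # torch.Size([1, 16, 7, 7])
```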
-import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward', - 'softmax_focal_loss_forward', 'softmax_focal_loss_backward' -]) - - -class SigmoidFocalLossFunction(Function): - - @staticmethod - def symbolic(g, input, target, gamma, alpha, weight, reduction): - return g.op( - 'mmcv::MMCVSigmoidFocalLoss', - input, - target, - gamma_f=gamma, - alpha_f=alpha, - weight_f=weight, - reduction_s=reduction) - - @staticmethod - def forward(ctx, - input, - target, - gamma=2.0, - alpha=0.25, - weight=None, - reduction='mean'): - - assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) - assert input.dim() == 2 - assert target.dim() == 1 - assert input.size(0) == target.size(0) - if weight is None: - weight = input.new_empty(0) - else: - assert weight.dim() == 1 - assert input.size(1) == weight.size(0) - ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} - assert reduction in ctx.reduction_dict.keys() - - ctx.gamma = float(gamma) - ctx.alpha = float(alpha) - ctx.reduction = ctx.reduction_dict[reduction] - - output = input.new_zeros(input.size()) - - ext_module.sigmoid_focal_loss_forward( - input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha) - if ctx.reduction == ctx.reduction_dict['mean']: - output = output.sum() / input.size(0) - elif ctx.reduction == ctx.reduction_dict['sum']: - output = output.sum() - ctx.save_for_backward(input, target, weight) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, target, weight = ctx.saved_tensors - - grad_input = input.new_zeros(input.size()) - - ext_module.sigmoid_focal_loss_backward( - input, - target, - weight, - grad_input, - gamma=ctx.gamma, - alpha=ctx.alpha) - - grad_input *= grad_output - if ctx.reduction == ctx.reduction_dict['mean']: - grad_input /= input.size(0) - return grad_input, None, None, None, None, None - - -sigmoid_focal_loss = SigmoidFocalLossFunction.apply - - -class SigmoidFocalLoss(nn.Module): - - def __init__(self, gamma, alpha, weight=None, reduction='mean'): - super(SigmoidFocalLoss, self).__init__() - self.gamma = gamma - self.alpha = alpha - self.register_buffer('weight', weight) - self.reduction = reduction - - def forward(self, input, target): - return sigmoid_focal_loss(input, target, self.gamma, self.alpha, - self.weight, self.reduction) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(gamma={self.gamma}, ' - s += f'alpha={self.alpha}, ' - s += f'reduction={self.reduction})' - return s - - -class SoftmaxFocalLossFunction(Function): - - @staticmethod - def symbolic(g, input, target, gamma, alpha, weight, reduction): - return g.op( - 'mmcv::MMCVSoftmaxFocalLoss', - input, - target, - gamma_f=gamma, - alpha_f=alpha, - weight_f=weight, - reduction_s=reduction) - - @staticmethod - def forward(ctx, - input, - target, - gamma=2.0, - alpha=0.25, - weight=None, - reduction='mean'): - - assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) - assert input.dim() == 2 - assert target.dim() == 1 - assert input.size(0) == target.size(0) - if weight is None: - weight = input.new_empty(0) - else: - assert weight.dim() == 1 - assert input.size(1) == weight.size(0) - ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} - assert reduction in ctx.reduction_dict.keys() - - ctx.gamma = float(gamma) - ctx.alpha = float(alpha) - 
ctx.reduction = ctx.reduction_dict[reduction] - - channel_stats, _ = torch.max(input, dim=1) - input_softmax = input - channel_stats.unsqueeze(1).expand_as(input) - input_softmax.exp_() - - channel_stats = input_softmax.sum(dim=1) - input_softmax /= channel_stats.unsqueeze(1).expand_as(input) - - output = input.new_zeros(input.size(0)) - ext_module.softmax_focal_loss_forward( - input_softmax, - target, - weight, - output, - gamma=ctx.gamma, - alpha=ctx.alpha) - - if ctx.reduction == ctx.reduction_dict['mean']: - output = output.sum() / input.size(0) - elif ctx.reduction == ctx.reduction_dict['sum']: - output = output.sum() - ctx.save_for_backward(input_softmax, target, weight) - return output - - @staticmethod - def backward(ctx, grad_output): - input_softmax, target, weight = ctx.saved_tensors - buff = input_softmax.new_zeros(input_softmax.size(0)) - grad_input = input_softmax.new_zeros(input_softmax.size()) - - ext_module.softmax_focal_loss_backward( - input_softmax, - target, - weight, - buff, - grad_input, - gamma=ctx.gamma, - alpha=ctx.alpha) - - grad_input *= grad_output - if ctx.reduction == ctx.reduction_dict['mean']: - grad_input /= input_softmax.size(0) - return grad_input, None, None, None, None, None - - -softmax_focal_loss = SoftmaxFocalLossFunction.apply - - -class SoftmaxFocalLoss(nn.Module): - - def __init__(self, gamma, alpha, weight=None, reduction='mean'): - super(SoftmaxFocalLoss, self).__init__() - self.gamma = gamma - self.alpha = alpha - self.register_buffer('weight', weight) - self.reduction = reduction - - def forward(self, input, target): - return softmax_focal_loss(input, target, self.gamma, self.alpha, - self.weight, self.reduction) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(gamma={self.gamma}, ' - s += f'alpha={self.alpha}, ' - s += f'reduction={self.reduction})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/furthest_point_sample.py b/ControlNet/annotator/uniformer/mmcv/ops/furthest_point_sample.py deleted file mode 100644 index 374b7a878f1972c183941af28ba1df216ac1a60f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/furthest_point_sample.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'furthest_point_sampling_forward', - 'furthest_point_sampling_with_dist_forward' -]) - - -class FurthestPointSampling(Function): - """Uses iterative furthest point sampling to select a set of features whose - corresponding points have the furthest distance.""" - - @staticmethod - def forward(ctx, points_xyz: torch.Tensor, - num_points: int) -> torch.Tensor: - """ - Args: - points_xyz (Tensor): (B, N, 3) where N > num_points. - num_points (int): Number of points in the sampled set. - - Returns: - Tensor: (B, num_points) indices of the sampled points. 
- """ - assert points_xyz.is_contiguous() - - B, N = points_xyz.size()[:2] - output = torch.cuda.IntTensor(B, num_points) - temp = torch.cuda.FloatTensor(B, N).fill_(1e10) - - ext_module.furthest_point_sampling_forward( - points_xyz, - temp, - output, - b=B, - n=N, - m=num_points, - ) - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(output) - return output - - @staticmethod - def backward(xyz, a=None): - return None, None - - -class FurthestPointSamplingWithDist(Function): - """Uses iterative furthest point sampling to select a set of features whose - corresponding points have the furthest distance.""" - - @staticmethod - def forward(ctx, points_dist: torch.Tensor, - num_points: int) -> torch.Tensor: - """ - Args: - points_dist (Tensor): (B, N, N) Distance between each point pair. - num_points (int): Number of points in the sampled set. - - Returns: - Tensor: (B, num_points) indices of the sampled points. - """ - assert points_dist.is_contiguous() - - B, N, _ = points_dist.size() - output = points_dist.new_zeros([B, num_points], dtype=torch.int32) - temp = points_dist.new_zeros([B, N]).fill_(1e10) - - ext_module.furthest_point_sampling_with_dist_forward( - points_dist, temp, output, b=B, n=N, m=num_points) - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(output) - return output - - @staticmethod - def backward(xyz, a=None): - return None, None - - -furthest_point_sample = FurthestPointSampling.apply -furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/fused_bias_leakyrelu.py b/ControlNet/annotator/uniformer/mmcv/ops/fused_bias_leakyrelu.py deleted file mode 100644 index 6d12508469c6c8fa1884debece44c58d158cb6fa..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/fused_bias_leakyrelu.py +++ /dev/null @@ -1,268 +0,0 @@ -# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501 - -# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. -# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator -# Augmentation (ADA) -# ======================================================================= - -# 1. Definitions - -# "Licensor" means any person or entity that distributes its Work. - -# "Software" means the original work of authorship made available under -# this License. - -# "Work" means the Software and any additions to or derivative works of -# the Software that are made available under this License. - -# The terms "reproduce," "reproduction," "derivative works," and -# "distribution" have the meaning as provided under U.S. copyright law; -# provided, however, that for the purposes of this License, derivative -# works shall not include works that remain separable from, or merely -# link (or bind by name) to the interfaces of, the Work. - -# Works, including the Software, are "made available" under this License -# by including in or with the Work either (a) a copyright notice -# referencing the applicability of this License to the Work, or (b) a -# copy of this License. - -# 2. License Grants - -# 2.1 Copyright Grant. Subject to the terms and conditions of this -# License, each Licensor grants to you a perpetual, worldwide, -# non-exclusive, royalty-free, copyright license to reproduce, -# prepare derivative works of, publicly display, publicly perform, -# sublicense and distribute its Work and any resulting derivative -# works in any form. - -# 3. Limitations - -# 3.1 Redistribution. 
You may reproduce or distribute the Work only -# if (a) you do so under this License, (b) you include a complete -# copy of this License with your distribution, and (c) you retain -# without modification any copyright, patent, trademark, or -# attribution notices that are present in the Work. - -# 3.2 Derivative Works. You may specify that additional or different -# terms apply to the use, reproduction, and distribution of your -# derivative works of the Work ("Your Terms") only if (a) Your Terms -# provide that the use limitation in Section 3.3 applies to your -# derivative works, and (b) you identify the specific derivative -# works that are subject to Your Terms. Notwithstanding Your Terms, -# this License (including the redistribution requirements in Section -# 3.1) will continue to apply to the Work itself. - -# 3.3 Use Limitation. The Work and any derivative works thereof only -# may be used or intended for use non-commercially. Notwithstanding -# the foregoing, NVIDIA and its affiliates may use the Work and any -# derivative works commercially. As used herein, "non-commercially" -# means for research or evaluation purposes only. - -# 3.4 Patent Claims. If you bring or threaten to bring a patent claim -# against any Licensor (including any claim, cross-claim or -# counterclaim in a lawsuit) to enforce any patents that you allege -# are infringed by any Work, then your rights under this License from -# such Licensor (including the grant in Section 2.1) will terminate -# immediately. - -# 3.5 Trademarks. This License does not grant any rights to use any -# Licensor’s or its affiliates’ names, logos, or trademarks, except -# as necessary to reproduce the notices described in this License. - -# 3.6 Termination. If you violate any term of this License, then your -# rights under this License (including the grant in Section 2.1) will -# terminate immediately. - -# 4. Disclaimer of Warranty. - -# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR -# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER -# THIS LICENSE. - -# 5. Limitation of Liability. - -# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL -# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE -# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, -# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF -# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK -# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, -# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER -# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGES. - -# ======================================================================= - -import torch -import torch.nn.functional as F -from torch import nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu']) - - -class FusedBiasLeakyReLUFunctionBackward(Function): - """Calculate second order deviation. - - This function is to compute the second order deviation for the fused leaky - relu operation. 
- """ - - @staticmethod - def forward(ctx, grad_output, out, negative_slope, scale): - ctx.save_for_backward(out) - ctx.negative_slope = negative_slope - ctx.scale = scale - - empty = grad_output.new_empty(0) - - grad_input = ext_module.fused_bias_leakyrelu( - grad_output, - empty, - out, - act=3, - grad=1, - alpha=negative_slope, - scale=scale) - - dim = [0] - - if grad_input.ndim > 2: - dim += list(range(2, grad_input.ndim)) - - grad_bias = grad_input.sum(dim).detach() - - return grad_input, grad_bias - - @staticmethod - def backward(ctx, gradgrad_input, gradgrad_bias): - out, = ctx.saved_tensors - - # The second order deviation, in fact, contains two parts, while the - # the first part is zero. Thus, we direct consider the second part - # which is similar with the first order deviation in implementation. - gradgrad_out = ext_module.fused_bias_leakyrelu( - gradgrad_input, - gradgrad_bias.to(out.dtype), - out, - act=3, - grad=1, - alpha=ctx.negative_slope, - scale=ctx.scale) - - return gradgrad_out, None, None, None - - -class FusedBiasLeakyReLUFunction(Function): - - @staticmethod - def forward(ctx, input, bias, negative_slope, scale): - empty = input.new_empty(0) - - out = ext_module.fused_bias_leakyrelu( - input, - bias, - empty, - act=3, - grad=0, - alpha=negative_slope, - scale=scale) - ctx.save_for_backward(out) - ctx.negative_slope = negative_slope - ctx.scale = scale - - return out - - @staticmethod - def backward(ctx, grad_output): - out, = ctx.saved_tensors - - grad_input, grad_bias = FusedBiasLeakyReLUFunctionBackward.apply( - grad_output, out, ctx.negative_slope, ctx.scale) - - return grad_input, grad_bias, None, None - - -class FusedBiasLeakyReLU(nn.Module): - """Fused bias leaky ReLU. - - This function is introduced in the StyleGAN2: - http://arxiv.org/abs/1912.04958 - - The bias term comes from the convolution operation. In addition, to keep - the variance of the feature map or gradients unchanged, they also adopt a - scale similarly with Kaiming initialization. However, since the - :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the - final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 - your own scale. - - TODO: Implement the CPU version. - - Args: - channel (int): The channel number of the feature map. - negative_slope (float, optional): Same as nn.LeakyRelu. - Defaults to 0.2. - scale (float, optional): A scalar to adjust the variance of the feature - map. Defaults to 2**0.5. - """ - - def __init__(self, num_channels, negative_slope=0.2, scale=2**0.5): - super(FusedBiasLeakyReLU, self).__init__() - - self.bias = nn.Parameter(torch.zeros(num_channels)) - self.negative_slope = negative_slope - self.scale = scale - - def forward(self, input): - return fused_bias_leakyrelu(input, self.bias, self.negative_slope, - self.scale) - - -def fused_bias_leakyrelu(input, bias, negative_slope=0.2, scale=2**0.5): - """Fused bias leaky ReLU function. - - This function is introduced in the StyleGAN2: - http://arxiv.org/abs/1912.04958 - - The bias term comes from the convolution operation. In addition, to keep - the variance of the feature map or gradients unchanged, they also adopt a - scale similarly with Kaiming initialization. However, since the - :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the - final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 - your own scale. - - Args: - input (torch.Tensor): Input feature map. 
- bias (nn.Parameter): The bias from convolution operation. - negative_slope (float, optional): Same as nn.LeakyRelu. - Defaults to 0.2. - scale (float, optional): A scalar to adjust the variance of the feature - map. Defaults to 2**0.5. - - Returns: - torch.Tensor: Feature map after non-linear activation. - """ - - if not input.is_cuda: - return bias_leakyrelu_ref(input, bias, negative_slope, scale) - - return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype), - negative_slope, scale) - - -def bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5): - - if bias is not None: - assert bias.ndim == 1 - assert bias.shape[0] == x.shape[1] - x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)]) - - x = F.leaky_relu(x, negative_slope) - if scale != 1: - x = x * scale - - return x diff --git a/ControlNet/annotator/uniformer/mmcv/ops/gather_points.py b/ControlNet/annotator/uniformer/mmcv/ops/gather_points.py deleted file mode 100644 index f52f1677d8ea0facafc56a3672d37adb44677ff3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/gather_points.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['gather_points_forward', 'gather_points_backward']) - - -class GatherPoints(Function): - """Gather points with given index.""" - - @staticmethod - def forward(ctx, features: torch.Tensor, - indices: torch.Tensor) -> torch.Tensor: - """ - Args: - features (Tensor): (B, C, N) features to gather. - indices (Tensor): (B, M) where M is the number of points. - - Returns: - Tensor: (B, C, M) where M is the number of points. - """ - assert features.is_contiguous() - assert indices.is_contiguous() - - B, npoint = indices.size() - _, C, N = features.size() - output = torch.cuda.FloatTensor(B, C, npoint) - - ext_module.gather_points_forward( - features, indices, output, b=B, c=C, n=N, npoints=npoint) - - ctx.for_backwards = (indices, C, N) - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(indices) - return output - - @staticmethod - def backward(ctx, grad_out): - idx, C, N = ctx.for_backwards - B, npoint = idx.size() - - grad_features = torch.cuda.FloatTensor(B, C, N).zero_() - grad_out_data = grad_out.data.contiguous() - ext_module.gather_points_backward( - grad_out_data, - idx, - grad_features.data, - b=B, - c=C, - n=N, - npoints=npoint) - return grad_features, None - - -gather_points = GatherPoints.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/group_points.py b/ControlNet/annotator/uniformer/mmcv/ops/group_points.py deleted file mode 100644 index 6c3ec9d758ebe4e1c2205882af4be154008253a5..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/group_points.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple - -import torch -from torch import nn as nn -from torch.autograd import Function - -from ..utils import ext_loader -from .ball_query import ball_query -from .knn import knn - -ext_module = ext_loader.load_ext( - '_ext', ['group_points_forward', 'group_points_backward']) - - -class QueryAndGroup(nn.Module): - """Groups points with a ball query of radius. - - Args: - max_radius (float): The maximum radius of the balls. - If None is given, we will use kNN sampling instead of ball query. - sample_num (int): Maximum number of features to gather in the ball. - min_radius (float, optional): The minimum radius of the balls. 
- Default: 0. - use_xyz (bool, optional): Whether to use xyz. - Default: True. - return_grouped_xyz (bool, optional): Whether to return grouped xyz. - Default: False. - normalize_xyz (bool, optional): Whether to normalize xyz. - Default: False. - uniform_sample (bool, optional): Whether to sample uniformly. - Default: False - return_unique_cnt (bool, optional): Whether to return the count of - unique samples. Default: False. - return_grouped_idx (bool, optional): Whether to return grouped idx. - Default: False. - """ - - def __init__(self, - max_radius, - sample_num, - min_radius=0, - use_xyz=True, - return_grouped_xyz=False, - normalize_xyz=False, - uniform_sample=False, - return_unique_cnt=False, - return_grouped_idx=False): - super().__init__() - self.max_radius = max_radius - self.min_radius = min_radius - self.sample_num = sample_num - self.use_xyz = use_xyz - self.return_grouped_xyz = return_grouped_xyz - self.normalize_xyz = normalize_xyz - self.uniform_sample = uniform_sample - self.return_unique_cnt = return_unique_cnt - self.return_grouped_idx = return_grouped_idx - if self.return_unique_cnt: - assert self.uniform_sample, \ - 'uniform_sample should be True when ' \ - 'returning the count of unique samples' - if self.max_radius is None: - assert not self.normalize_xyz, \ - 'can not normalize grouped xyz when max_radius is None' - - def forward(self, points_xyz, center_xyz, features=None): - """ - Args: - points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - center_xyz (Tensor): (B, npoint, 3) coordinates of the centriods. - features (Tensor): (B, C, N) Descriptors of the features. - - Returns: - Tensor: (B, 3 + C, npoint, sample_num) Grouped feature. - """ - # if self.max_radius is None, we will perform kNN instead of ball query - # idx is of shape [B, npoint, sample_num] - if self.max_radius is None: - idx = knn(self.sample_num, points_xyz, center_xyz, False) - idx = idx.transpose(1, 2).contiguous() - else: - idx = ball_query(self.min_radius, self.max_radius, self.sample_num, - points_xyz, center_xyz) - - if self.uniform_sample: - unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) - for i_batch in range(idx.shape[0]): - for i_region in range(idx.shape[1]): - unique_ind = torch.unique(idx[i_batch, i_region, :]) - num_unique = unique_ind.shape[0] - unique_cnt[i_batch, i_region] = num_unique - sample_ind = torch.randint( - 0, - num_unique, (self.sample_num - num_unique, ), - dtype=torch.long) - all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) - idx[i_batch, i_region, :] = all_ind - - xyz_trans = points_xyz.transpose(1, 2).contiguous() - # (B, 3, npoint, sample_num) - grouped_xyz = grouping_operation(xyz_trans, idx) - grouped_xyz_diff = grouped_xyz - \ - center_xyz.transpose(1, 2).unsqueeze(-1) # relative offsets - if self.normalize_xyz: - grouped_xyz_diff /= self.max_radius - - if features is not None: - grouped_features = grouping_operation(features, idx) - if self.use_xyz: - # (B, C + 3, npoint, sample_num) - new_features = torch.cat([grouped_xyz_diff, grouped_features], - dim=1) - else: - new_features = grouped_features - else: - assert (self.use_xyz - ), 'Cannot have not features and not use xyz as a feature!' - new_features = grouped_xyz_diff - - ret = [new_features] - if self.return_grouped_xyz: - ret.append(grouped_xyz) - if self.return_unique_cnt: - ret.append(unique_cnt) - if self.return_grouped_idx: - ret.append(idx) - if len(ret) == 1: - return ret[0] - else: - return tuple(ret) - - -class GroupAll(nn.Module): - """Group xyz with feature. 
- - Args: - use_xyz (bool): Whether to use xyz. - """ - - def __init__(self, use_xyz: bool = True): - super().__init__() - self.use_xyz = use_xyz - - def forward(self, - xyz: torch.Tensor, - new_xyz: torch.Tensor, - features: torch.Tensor = None): - """ - Args: - xyz (Tensor): (B, N, 3) xyz coordinates of the features. - new_xyz (Tensor): new xyz coordinates of the features. - features (Tensor): (B, C, N) features to group. - - Returns: - Tensor: (B, C + 3, 1, N) Grouped feature. - """ - grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) - if features is not None: - grouped_features = features.unsqueeze(2) - if self.use_xyz: - # (B, 3 + C, 1, N) - new_features = torch.cat([grouped_xyz, grouped_features], - dim=1) - else: - new_features = grouped_features - else: - new_features = grouped_xyz - - return new_features - - -class GroupingOperation(Function): - """Group feature with given index.""" - - @staticmethod - def forward(ctx, features: torch.Tensor, - indices: torch.Tensor) -> torch.Tensor: - """ - Args: - features (Tensor): (B, C, N) tensor of features to group. - indices (Tensor): (B, npoint, nsample) the indices of - features to group with. - - Returns: - Tensor: (B, C, npoint, nsample) Grouped features. - """ - features = features.contiguous() - indices = indices.contiguous() - - B, nfeatures, nsample = indices.size() - _, C, N = features.size() - output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) - - ext_module.group_points_forward(B, C, N, nfeatures, nsample, features, - indices, output) - - ctx.for_backwards = (indices, N) - return output - - @staticmethod - def backward(ctx, - grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Args: - grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients - of the output from forward. - - Returns: - Tensor: (B, C, N) gradient of the features. - """ - idx, N = ctx.for_backwards - - B, C, npoint, nsample = grad_out.size() - grad_features = torch.cuda.FloatTensor(B, C, N).zero_() - - grad_out_data = grad_out.data.contiguous() - ext_module.group_points_backward(B, C, N, npoint, nsample, - grad_out_data, idx, - grad_features.data) - return grad_features, None - - -grouping_operation = GroupingOperation.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/info.py b/ControlNet/annotator/uniformer/mmcv/ops/info.py deleted file mode 100644 index 29f2e5598ae2bb5866ccd15a7d3b4de33c0cd14d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/info.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
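As a quick illustration of the point-grouping op removed above, QueryAndGroup can be driven roughly as follows; this is a sketch only, assuming the mmcv `_ext` CUDA extension is built, with shapes taken from the docstring in the deleted group_points.py:

    import torch
    grouper = QueryAndGroup(max_radius=0.2, sample_num=16)  # ball query, 16 samples per center
    xyz = torch.rand(2, 1024, 3).cuda()      # (B, N, 3) point coordinates
    centers = torch.rand(2, 128, 3).cuda()   # (B, npoint, 3) query centers
    feats = torch.rand(2, 32, 1024).cuda()   # (B, C, N) per-point features
    grouped = grouper(xyz, centers, feats)   # -> (B, 3 + C, 128, 16) grouped features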
-import glob -import os - -import torch - -if torch.__version__ == 'parrots': - import parrots - - def get_compiler_version(): - return 'GCC ' + parrots.version.compiler - - def get_compiling_cuda_version(): - return parrots.version.cuda -else: - from ..utils import ext_loader - ext_module = ext_loader.load_ext( - '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) - - def get_compiler_version(): - return ext_module.get_compiler_version() - - def get_compiling_cuda_version(): - return ext_module.get_compiling_cuda_version() - - -def get_onnxruntime_op_path(): - wildcard = os.path.join( - os.path.abspath(os.path.dirname(os.path.dirname(__file__))), - '_ext_ort.*.so') - - paths = glob.glob(wildcard) - if len(paths) > 0: - return paths[0] - else: - return '' diff --git a/ControlNet/annotator/uniformer/mmcv/ops/iou3d.py b/ControlNet/annotator/uniformer/mmcv/ops/iou3d.py deleted file mode 100644 index 6fc71979190323f44c09f8b7e1761cf49cd2d76b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/iou3d.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'iou3d_boxes_iou_bev_forward', 'iou3d_nms_forward', - 'iou3d_nms_normal_forward' -]) - - -def boxes_iou_bev(boxes_a, boxes_b): - """Calculate boxes IoU in the Bird's Eye View. - - Args: - boxes_a (torch.Tensor): Input boxes a with shape (M, 5). - boxes_b (torch.Tensor): Input boxes b with shape (N, 5). - - Returns: - ans_iou (torch.Tensor): IoU result with shape (M, N). - """ - ans_iou = boxes_a.new_zeros( - torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) - - ext_module.iou3d_boxes_iou_bev_forward(boxes_a.contiguous(), - boxes_b.contiguous(), ans_iou) - - return ans_iou - - -def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): - """NMS function GPU implementation (for BEV boxes). The overlap of two - boxes for IoU calculation is defined as the exact overlapping area of the - two boxes. In this function, one can also set ``pre_max_size`` and - ``post_max_size``. - - Args: - boxes (torch.Tensor): Input boxes with the shape of [N, 5] - ([x1, y1, x2, y2, ry]). - scores (torch.Tensor): Scores of boxes with the shape of [N]. - thresh (float): Overlap threshold of NMS. - pre_max_size (int, optional): Max size of boxes before NMS. - Default: None. - post_max_size (int, optional): Max size of boxes after NMS. - Default: None. - - Returns: - torch.Tensor: Indexes after NMS. - """ - assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' - order = scores.sort(0, descending=True)[1] - - if pre_max_size is not None: - order = order[:pre_max_size] - boxes = boxes[order].contiguous() - - keep = torch.zeros(boxes.size(0), dtype=torch.long) - num_out = ext_module.iou3d_nms_forward(boxes, keep, thresh) - keep = order[keep[:num_out].cuda(boxes.device)].contiguous() - if post_max_size is not None: - keep = keep[:post_max_size] - return keep - - -def nms_normal_bev(boxes, scores, thresh): - """Normal NMS function GPU implementation (for BEV boxes). The overlap of - two boxes for IoU calculation is defined as the exact overlapping area of - the two boxes WITH their yaw angle set to 0. - - Args: - boxes (torch.Tensor): Input boxes with shape (N, 5). - scores (torch.Tensor): Scores of predicted boxes with shape (N). - thresh (float): Overlap threshold of NMS. - - Returns: - torch.Tensor: Remaining indices with scores in descending order. 
- """ - assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' - order = scores.sort(0, descending=True)[1] - - boxes = boxes[order].contiguous() - - keep = torch.zeros(boxes.size(0), dtype=torch.long) - num_out = ext_module.iou3d_nms_normal_forward(boxes, keep, thresh) - return order[keep[:num_out].cuda(boxes.device)].contiguous() diff --git a/ControlNet/annotator/uniformer/mmcv/ops/knn.py b/ControlNet/annotator/uniformer/mmcv/ops/knn.py deleted file mode 100644 index f335785036669fc19239825b0aae6dde3f73bf92..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/knn.py +++ /dev/null @@ -1,77 +0,0 @@ -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['knn_forward']) - - -class KNN(Function): - r"""KNN (CUDA) based on heap data structure. - Modified from `PAConv `_. - - Find k-nearest points. - """ - - @staticmethod - def forward(ctx, - k: int, - xyz: torch.Tensor, - center_xyz: torch.Tensor = None, - transposed: bool = False) -> torch.Tensor: - """ - Args: - k (int): number of nearest neighbors. - xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N). - xyz coordinates of the features. - center_xyz (Tensor, optional): (B, npoint, 3) if transposed == - False, else (B, 3, npoint). centers of the knn query. - Default: None. - transposed (bool, optional): whether the input tensors are - transposed. Should not explicitly use this keyword when - calling knn (=KNN.apply), just add the fourth param. - Default: False. - - Returns: - Tensor: (B, k, npoint) tensor with the indices of - the features that form k-nearest neighbours. - """ - assert (k > 0) & (k < 100), 'k should be in range(0, 100)' - - if center_xyz is None: - center_xyz = xyz - - if transposed: - xyz = xyz.transpose(2, 1).contiguous() - center_xyz = center_xyz.transpose(2, 1).contiguous() - - assert xyz.is_contiguous() # [B, N, 3] - assert center_xyz.is_contiguous() # [B, npoint, 3] - - center_xyz_device = center_xyz.get_device() - assert center_xyz_device == xyz.get_device(), \ - 'center_xyz and xyz should be put on the same device' - if torch.cuda.current_device() != center_xyz_device: - torch.cuda.set_device(center_xyz_device) - - B, npoint, _ = center_xyz.shape - N = xyz.shape[1] - - idx = center_xyz.new_zeros((B, npoint, k)).int() - dist2 = center_xyz.new_zeros((B, npoint, k)).float() - - ext_module.knn_forward( - xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k) - # idx shape to [B, k, npoint] - idx = idx.transpose(2, 1).contiguous() - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(idx) - return idx - - @staticmethod - def backward(ctx, a=None): - return None, None, None - - -knn = KNN.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/masked_conv.py b/ControlNet/annotator/uniformer/mmcv/ops/masked_conv.py deleted file mode 100644 index cd514cc204c1d571ea5dc7e74b038c0f477a008b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/masked_conv.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import math - -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['masked_im2col_forward', 'masked_col2im_forward']) - - -class MaskedConv2dFunction(Function): - - @staticmethod - def symbolic(g, features, mask, weight, bias, padding, stride): - return g.op( - 'mmcv::MMCVMaskedConv2d', - features, - mask, - weight, - bias, - padding_i=padding, - stride_i=stride) - - @staticmethod - def forward(ctx, features, mask, weight, bias, padding=0, stride=1): - assert mask.dim() == 3 and mask.size(0) == 1 - assert features.dim() == 4 and features.size(0) == 1 - assert features.size()[2:] == mask.size()[1:] - pad_h, pad_w = _pair(padding) - stride_h, stride_w = _pair(stride) - if stride_h != 1 or stride_w != 1: - raise ValueError( - 'Stride could not only be 1 in masked_conv2d currently.') - out_channel, in_channel, kernel_h, kernel_w = weight.size() - - batch_size = features.size(0) - out_h = int( - math.floor((features.size(2) + 2 * pad_h - - (kernel_h - 1) - 1) / stride_h + 1)) - out_w = int( - math.floor((features.size(3) + 2 * pad_w - - (kernel_h - 1) - 1) / stride_w + 1)) - mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False) - output = features.new_zeros(batch_size, out_channel, out_h, out_w) - if mask_inds.numel() > 0: - mask_h_idx = mask_inds[:, 0].contiguous() - mask_w_idx = mask_inds[:, 1].contiguous() - data_col = features.new_zeros(in_channel * kernel_h * kernel_w, - mask_inds.size(0)) - ext_module.masked_im2col_forward( - features, - mask_h_idx, - mask_w_idx, - data_col, - kernel_h=kernel_h, - kernel_w=kernel_w, - pad_h=pad_h, - pad_w=pad_w) - - masked_output = torch.addmm(1, bias[:, None], 1, - weight.view(out_channel, -1), data_col) - ext_module.masked_col2im_forward( - masked_output, - mask_h_idx, - mask_w_idx, - output, - height=out_h, - width=out_w, - channels=out_channel) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - return (None, ) * 5 - - -masked_conv2d = MaskedConv2dFunction.apply - - -class MaskedConv2d(nn.Conv2d): - """A MaskedConv2d which inherits the official Conv2d. - - The masked forward doesn't implement the backward function and only - supports the stride parameter to be 1 currently. - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): - super(MaskedConv2d, - self).__init__(in_channels, out_channels, kernel_size, stride, - padding, dilation, groups, bias) - - def forward(self, input, mask=None): - if mask is None: # fallback to the normal Conv2d - return super(MaskedConv2d, self).forward(input) - else: - return masked_conv2d(input, mask, self.weight, self.bias, - self.padding) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/merge_cells.py b/ControlNet/annotator/uniformer/mmcv/ops/merge_cells.py deleted file mode 100644 index 48ca8cc0a8aca8432835bd760c0403a3c35b34cf..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/merge_cells.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import abstractmethod - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..cnn import ConvModule - - -class BaseMergeCell(nn.Module): - """The basic class for cells used in NAS-FPN and NAS-FCOS. - - BaseMergeCell takes 2 inputs. 
After applying convolution - on them, they are resized to the target size. Then, - they go through binary_op, which depends on the type of cell. - If with_out_conv is True, the result of output will go through - another convolution layer. - - Args: - in_channels (int): number of input channels in out_conv layer. - out_channels (int): number of output channels in out_conv layer. - with_out_conv (bool): Whether to use out_conv layer - out_conv_cfg (dict): Config dict for convolution layer, which should - contain "groups", "kernel_size", "padding", "bias" to build - out_conv layer. - out_norm_cfg (dict): Config dict for normalization layer in out_conv. - out_conv_order (tuple): The order of conv/norm/activation layers in - out_conv. - with_input1_conv (bool): Whether to use convolution on input1. - with_input2_conv (bool): Whether to use convolution on input2. - input_conv_cfg (dict): Config dict for building input1_conv layer and - input2_conv layer, which is expected to contain the type of - convolution. - Default: None, which means using conv2d. - input_norm_cfg (dict): Config dict for normalization layer in - input1_conv and input2_conv layer. Default: None. - upsample_mode (str): Interpolation method used to resize the output - of input1_conv and input2_conv to target size. Currently, we - support ['nearest', 'bilinear']. Default: 'nearest'. - """ - - def __init__(self, - fused_channels=256, - out_channels=256, - with_out_conv=True, - out_conv_cfg=dict( - groups=1, kernel_size=3, padding=1, bias=True), - out_norm_cfg=None, - out_conv_order=('act', 'conv', 'norm'), - with_input1_conv=False, - with_input2_conv=False, - input_conv_cfg=None, - input_norm_cfg=None, - upsample_mode='nearest'): - super(BaseMergeCell, self).__init__() - assert upsample_mode in ['nearest', 'bilinear'] - self.with_out_conv = with_out_conv - self.with_input1_conv = with_input1_conv - self.with_input2_conv = with_input2_conv - self.upsample_mode = upsample_mode - - if self.with_out_conv: - self.out_conv = ConvModule( - fused_channels, - out_channels, - **out_conv_cfg, - norm_cfg=out_norm_cfg, - order=out_conv_order) - - self.input1_conv = self._build_input_conv( - out_channels, input_conv_cfg, - input_norm_cfg) if with_input1_conv else nn.Sequential() - self.input2_conv = self._build_input_conv( - out_channels, input_conv_cfg, - input_norm_cfg) if with_input2_conv else nn.Sequential() - - def _build_input_conv(self, channel, conv_cfg, norm_cfg): - return ConvModule( - channel, - channel, - 3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True) - - @abstractmethod - def _binary_op(self, x1, x2): - pass - - def _resize(self, x, size): - if x.shape[-2:] == size: - return x - elif x.shape[-2:] < size: - return F.interpolate(x, size=size, mode=self.upsample_mode) - else: - assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0 - kernel_size = x.shape[-1] // size[-1] - x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size) - return x - - def forward(self, x1, x2, out_size=None): - assert x1.shape[:2] == x2.shape[:2] - assert out_size is None or len(out_size) == 2 - if out_size is None: # resize to larger one - out_size = max(x1.size()[2:], x2.size()[2:]) - - x1 = self.input1_conv(x1) - x2 = self.input2_conv(x2) - - x1 = self._resize(x1, out_size) - x2 = self._resize(x2, out_size) - - x = self._binary_op(x1, x2) - if self.with_out_conv: - x = self.out_conv(x) - return x - - -class SumCell(BaseMergeCell): - - def __init__(self, in_channels, out_channels, **kwargs): - super(SumCell, 
self).__init__(in_channels, out_channels, **kwargs) - - def _binary_op(self, x1, x2): - return x1 + x2 - - -class ConcatCell(BaseMergeCell): - - def __init__(self, in_channels, out_channels, **kwargs): - super(ConcatCell, self).__init__(in_channels * 2, out_channels, - **kwargs) - - def _binary_op(self, x1, x2): - ret = torch.cat([x1, x2], dim=1) - return ret - - -class GlobalPoolingCell(BaseMergeCell): - - def __init__(self, in_channels=None, out_channels=None, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) - - def _binary_op(self, x1, x2): - x2_att = self.global_pool(x2).sigmoid() - return x2 + x2_att * x1 diff --git a/ControlNet/annotator/uniformer/mmcv/ops/modulated_deform_conv.py b/ControlNet/annotator/uniformer/mmcv/ops/modulated_deform_conv.py deleted file mode 100644 index 75559579cf053abcc99538606cbb88c723faf783..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/modulated_deform_conv.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair, _single - -from annotator.uniformer.mmcv.utils import deprecated_api_warning -from ..cnn import CONV_LAYERS -from ..utils import ext_loader, print_log - -ext_module = ext_loader.load_ext( - '_ext', - ['modulated_deform_conv_forward', 'modulated_deform_conv_backward']) - - -class ModulatedDeformConv2dFunction(Function): - - @staticmethod - def symbolic(g, input, offset, mask, weight, bias, stride, padding, - dilation, groups, deform_groups): - input_tensors = [input, offset, mask, weight] - if bias is not None: - input_tensors.append(bias) - return g.op( - 'mmcv::MMCVModulatedDeformConv2d', - *input_tensors, - stride_i=stride, - padding_i=padding, - dilation_i=dilation, - groups_i=groups, - deform_groups_i=deform_groups) - - @staticmethod - def forward(ctx, - input, - offset, - mask, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - deform_groups=1): - if input is not None and input.dim() != 4: - raise ValueError( - f'Expected 4D tensor as input, got {input.dim()}D tensor \ - instead.') - ctx.stride = _pair(stride) - ctx.padding = _pair(padding) - ctx.dilation = _pair(dilation) - ctx.groups = groups - ctx.deform_groups = deform_groups - ctx.with_bias = bias is not None - if not ctx.with_bias: - bias = input.new_empty(0) # fake tensor - # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; - # amp won't cast the type of model (float32), but "offset" is cast - # to float16 by nn.Conv2d automatically, leading to the type - # mismatch with input (when it is float32) or weight. - # The flag for whether to use fp16 or amp is the type of "offset", - # we cast weight and input to temporarily support fp16 and amp - # whatever the pytorch version is. 
- input = input.type_as(offset) - weight = weight.type_as(input) - ctx.save_for_backward(input, offset, mask, weight, bias) - output = input.new_empty( - ModulatedDeformConv2dFunction._output_size(ctx, input, weight)) - ctx._bufs = [input.new_empty(0), input.new_empty(0)] - ext_module.modulated_deform_conv_forward( - input, - weight, - bias, - ctx._bufs[0], - offset, - mask, - output, - ctx._bufs[1], - kernel_h=weight.size(2), - kernel_w=weight.size(3), - stride_h=ctx.stride[0], - stride_w=ctx.stride[1], - pad_h=ctx.padding[0], - pad_w=ctx.padding[1], - dilation_h=ctx.dilation[0], - dilation_w=ctx.dilation[1], - group=ctx.groups, - deformable_group=ctx.deform_groups, - with_bias=ctx.with_bias) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, offset, mask, weight, bias = ctx.saved_tensors - grad_input = torch.zeros_like(input) - grad_offset = torch.zeros_like(offset) - grad_mask = torch.zeros_like(mask) - grad_weight = torch.zeros_like(weight) - grad_bias = torch.zeros_like(bias) - grad_output = grad_output.contiguous() - ext_module.modulated_deform_conv_backward( - input, - weight, - bias, - ctx._bufs[0], - offset, - mask, - ctx._bufs[1], - grad_input, - grad_weight, - grad_bias, - grad_offset, - grad_mask, - grad_output, - kernel_h=weight.size(2), - kernel_w=weight.size(3), - stride_h=ctx.stride[0], - stride_w=ctx.stride[1], - pad_h=ctx.padding[0], - pad_w=ctx.padding[1], - dilation_h=ctx.dilation[0], - dilation_w=ctx.dilation[1], - group=ctx.groups, - deformable_group=ctx.deform_groups, - with_bias=ctx.with_bias) - if not ctx.with_bias: - grad_bias = None - - return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, - None, None, None, None, None) - - @staticmethod - def _output_size(ctx, input, weight): - channels = weight.size(0) - output_size = (input.size(0), channels) - for d in range(input.dim() - 2): - in_size = input.size(d + 2) - pad = ctx.padding[d] - kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 - stride_ = ctx.stride[d] - output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) - if not all(map(lambda s: s > 0, output_size)): - raise ValueError( - 'convolution input is too small (output would be ' + - 'x'.join(map(str, output_size)) + ')') - return output_size - - -modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply - - -class ModulatedDeformConv2d(nn.Module): - - @deprecated_api_warning({'deformable_groups': 'deform_groups'}, - cls_name='ModulatedDeformConv2d') - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - deform_groups=1, - bias=True): - super(ModulatedDeformConv2d, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _pair(kernel_size) - self.stride = _pair(stride) - self.padding = _pair(padding) - self.dilation = _pair(dilation) - self.groups = groups - self.deform_groups = deform_groups - # enable compatibility with nn.Conv2d - self.transposed = False - self.output_padding = _single(0) - - self.weight = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, - *self.kernel_size)) - if bias: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter('bias', None) - self.init_weights() - - def init_weights(self): - n = self.in_channels - for k in self.kernel_size: - n *= k - stdv = 1. 
/ math.sqrt(n) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.zero_() - - def forward(self, x, offset, mask): - return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, - self.stride, self.padding, - self.dilation, self.groups, - self.deform_groups) - - -@CONV_LAYERS.register_module('DCNv2') -class ModulatedDeformConv2dPack(ModulatedDeformConv2d): - """A ModulatedDeformable Conv Encapsulation that acts as normal Conv - layers. - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int or tuple[int]): Same as nn.Conv2d. - stride (int): Same as nn.Conv2d, while tuple is not supported. - padding (int): Same as nn.Conv2d, while tuple is not supported. - dilation (int): Same as nn.Conv2d, while tuple is not supported. - groups (int): Same as nn.Conv2d. - bias (bool or str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if norm_cfg is None, otherwise - False. - """ - - _version = 2 - - def __init__(self, *args, **kwargs): - super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs) - self.conv_offset = nn.Conv2d( - self.in_channels, - self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1], - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - bias=True) - self.init_weights() - - def init_weights(self): - super(ModulatedDeformConv2dPack, self).init_weights() - if hasattr(self, 'conv_offset'): - self.conv_offset.weight.data.zero_() - self.conv_offset.bias.data.zero_() - - def forward(self, x): - out = self.conv_offset(x) - o1, o2, mask = torch.chunk(out, 3, dim=1) - offset = torch.cat((o1, o2), dim=1) - mask = torch.sigmoid(mask) - return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, - self.stride, self.padding, - self.dilation, self.groups, - self.deform_groups) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) - - if version is None or version < 2: - # the key is different in early versions - # In version < 2, ModulatedDeformConvPack - # loads previous benchmark models. - if (prefix + 'conv_offset.weight' not in state_dict - and prefix[:-1] + '_offset.weight' in state_dict): - state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( - prefix[:-1] + '_offset.weight') - if (prefix + 'conv_offset.bias' not in state_dict - and prefix[:-1] + '_offset.bias' in state_dict): - state_dict[prefix + - 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + - '_offset.bias') - - if version is not None and version > 1: - print_log( - f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to ' - 'version 2.', - logger='root') - - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, - error_msgs) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/multi_scale_deform_attn.py b/ControlNet/annotator/uniformer/mmcv/ops/multi_scale_deform_attn.py deleted file mode 100644 index c52dda18b41705705b47dd0e995b124048c16fba..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/multi_scale_deform_attn.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
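For context, the DCNv2 layer removed above (ModulatedDeformConv2dPack) acts as a drop-in replacement for nn.Conv2d that predicts its own offsets and modulation masks; a minimal sketch, assuming the compiled mmcv extension and a CUDA device:

    import torch
    dcn = ModulatedDeformConv2dPack(64, 64, kernel_size=3, padding=1, deform_groups=1).cuda()
    x = torch.randn(2, 64, 32, 32).cuda()
    y = dcn(x)   # offsets and masks come from the internal conv_offset branch; y is (2, 64, 32, 32)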
-import math -import warnings - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd.function import Function, once_differentiable - -from annotator.uniformer.mmcv import deprecated_api_warning -from annotator.uniformer.mmcv.cnn import constant_init, xavier_init -from annotator.uniformer.mmcv.cnn.bricks.registry import ATTENTION -from annotator.uniformer.mmcv.runner import BaseModule -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) - - -class MultiScaleDeformableAttnFunction(Function): - - @staticmethod - def forward(ctx, value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights, im2col_step): - """GPU version of multi-scale deformable attention. - - Args: - value (Tensor): The value has shape - (bs, num_keys, mum_heads, embed_dims//num_heads) - value_spatial_shapes (Tensor): Spatial shape of - each feature map, has shape (num_levels, 2), - last dimension 2 represent (h, w) - sampling_locations (Tensor): The location of sampling points, - has shape - (bs ,num_queries, num_heads, num_levels, num_points, 2), - the last dimension 2 represent (x, y). - attention_weights (Tensor): The weight of sampling points used - when calculate the attention, has shape - (bs ,num_queries, num_heads, num_levels, num_points), - im2col_step (Tensor): The step used in image to column. - - Returns: - Tensor: has shape (bs, num_queries, embed_dims) - """ - - ctx.im2col_step = im2col_step - output = ext_module.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - im2col_step=ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, - value_level_start_index, sampling_locations, - attention_weights) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - """GPU version of backward function. - - Args: - grad_output (Tensor): Gradient - of output tensor of forward. - - Returns: - Tuple[Tensor]: Gradient - of input tensors in forward. - """ - value, value_spatial_shapes, value_level_start_index,\ - sampling_locations, attention_weights = ctx.saved_tensors - grad_value = torch.zeros_like(value) - grad_sampling_loc = torch.zeros_like(sampling_locations) - grad_attn_weight = torch.zeros_like(attention_weights) - - ext_module.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output.contiguous(), - grad_value, - grad_sampling_loc, - grad_attn_weight, - im2col_step=ctx.im2col_step) - - return grad_value, None, None, \ - grad_sampling_loc, grad_attn_weight, None - - -def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, - sampling_locations, attention_weights): - """CPU version of multi-scale deformable attention. - - Args: - value (Tensor): The value has shape - (bs, num_keys, mum_heads, embed_dims//num_heads) - value_spatial_shapes (Tensor): Spatial shape of - each feature map, has shape (num_levels, 2), - last dimension 2 represent (h, w) - sampling_locations (Tensor): The location of sampling points, - has shape - (bs ,num_queries, num_heads, num_levels, num_points, 2), - the last dimension 2 represent (x, y). 
- attention_weights (Tensor): The weight of sampling points used - when calculate the attention, has shape - (bs ,num_queries, num_heads, num_levels, num_points), - - Returns: - Tensor: has shape (bs, num_queries, embed_dims) - """ - - bs, _, num_heads, embed_dims = value.shape - _, num_queries, num_heads, num_levels, num_points, _ =\ - sampling_locations.shape - value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], - dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for level, (H_, W_) in enumerate(value_spatial_shapes): - # bs, H_*W_, num_heads, embed_dims -> - # bs, H_*W_, num_heads*embed_dims -> - # bs, num_heads*embed_dims, H_*W_ -> - # bs*num_heads, embed_dims, H_, W_ - value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( - bs * num_heads, embed_dims, H_, W_) - # bs, num_queries, num_heads, num_points, 2 -> - # bs, num_heads, num_queries, num_points, 2 -> - # bs*num_heads, num_queries, num_points, 2 - sampling_grid_l_ = sampling_grids[:, :, :, - level].transpose(1, 2).flatten(0, 1) - # bs*num_heads, embed_dims, num_queries, num_points - sampling_value_l_ = F.grid_sample( - value_l_, - sampling_grid_l_, - mode='bilinear', - padding_mode='zeros', - align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (bs, num_queries, num_heads, num_levels, num_points) -> - # (bs, num_heads, num_queries, num_levels, num_points) -> - # (bs, num_heads, 1, num_queries, num_levels*num_points) - attention_weights = attention_weights.transpose(1, 2).reshape( - bs * num_heads, 1, num_queries, num_levels * num_points) - output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * - attention_weights).sum(-1).view(bs, num_heads * embed_dims, - num_queries) - return output.transpose(1, 2).contiguous() - - -@ATTENTION.register_module() -class MultiScaleDeformableAttention(BaseModule): - """An attention module used in Deformable-Detr. - - `Deformable DETR: Deformable Transformers for End-to-End Object Detection. - `_. - - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_heads (int): Parallel attention heads. Default: 64. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 4. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_identity`. - Default: 0.1. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=4, - im2col_step=64, - dropout=0.1, - batch_first=False, - norm_cfg=None, - init_cfg=None): - super().__init__(init_cfg) - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.dropout = nn.Dropout(dropout) - self.batch_first = batch_first - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn( - "You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.sampling_offsets = nn.Linear( - embed_dims, num_heads * num_levels * num_points * 2) - self.attention_weights = nn.Linear(embed_dims, - num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - self.output_proj = nn.Linear(embed_dims, embed_dims) - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_init(self.sampling_offsets, 0.) - thetas = torch.arange( - self.num_heads, - dtype=torch.float32) * (2.0 * math.pi / self.num_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / - grid_init.abs().max(-1, keepdim=True)[0]).view( - self.num_heads, 1, 1, - 2).repeat(1, self.num_levels, self.num_points, 1) - for i in range(self.num_points): - grid_init[:, :, i, :] *= i + 1 - - self.sampling_offsets.bias.data = grid_init.view(-1) - constant_init(self.attention_weights, val=0., bias=0.) - xavier_init(self.value_proj, distribution='uniform', bias=0.) - xavier_init(self.output_proj, distribution='uniform', bias=0.) - self._is_init = True - - @deprecated_api_warning({'residual': 'identity'}, - cls_name='MultiScaleDeformableAttention') - def forward(self, - query, - key=None, - value=None, - identity=None, - query_pos=None, - key_padding_mask=None, - reference_points=None, - spatial_shapes=None, - level_start_index=None, - **kwargs): - """Forward Function of MultiScaleDeformAttention. - - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape - `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. - identity (Tensor): The tensor used for addition, with the - same shape as `query`. Default None. If None, - `query` will be used. - query_pos (Tensor): The positional encoding for `query`. - Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, num_levels, 2), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. 
- key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different levels. With shape (num_levels, 2), - last dimension represents (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape ``(num_levels, )`` and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. - """ - - if value is None: - value = query - - if identity is None: - identity = query - if query_pos is not None: - query = query + query_pos - if not self.batch_first: - # change to (bs, num_query ,embed_dims) - query = query.permute(1, 0, 2) - value = value.permute(1, 0, 2) - - bs, num_query, _ = query.shape - bs, num_value, _ = value.shape - assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value - - value = self.value_proj(value) - if key_padding_mask is not None: - value = value.masked_fill(key_padding_mask[..., None], 0.0) - value = value.view(bs, num_value, self.num_heads, -1) - sampling_offsets = self.sampling_offsets(query).view( - bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) - attention_weights = self.attention_weights(query).view( - bs, num_query, self.num_heads, self.num_levels * self.num_points) - attention_weights = attention_weights.softmax(-1) - - attention_weights = attention_weights.view(bs, num_query, - self.num_heads, - self.num_levels, - self.num_points) - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack( - [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets \ - / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.num_points \ - * reference_points[:, :, None, :, None, 2:] \ - * 0.5 - else: - raise ValueError( - f'Last dim of reference_points must be' - f' 2 or 4, but get {reference_points.shape[-1]} instead.') - if torch.cuda.is_available() and value.is_cuda: - output = MultiScaleDeformableAttnFunction.apply( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights, self.im2col_step) - else: - output = multi_scale_deformable_attn_pytorch( - value, spatial_shapes, sampling_locations, attention_weights) - - output = self.output_proj(output) - - if not self.batch_first: - # (num_query, bs ,embed_dims) - output = output.permute(1, 0, 2) - - return self.dropout(output) + identity diff --git a/ControlNet/annotator/uniformer/mmcv/ops/nms.py b/ControlNet/annotator/uniformer/mmcv/ops/nms.py deleted file mode 100644 index 6d9634281f486ab284091786886854c451368052..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/nms.py +++ /dev/null @@ -1,417 +0,0 @@ -import os - -import numpy as np -import torch - -from annotator.uniformer.mmcv.utils import deprecated_api_warning -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated']) - - -# This function is modified from: https://github.com/pytorch/vision/ -class NMSop(torch.autograd.Function): - - @staticmethod - def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold, - max_num): - is_filtering_by_score = score_threshold > 0 - if is_filtering_by_score: - valid_mask = scores > score_threshold - bboxes, scores = bboxes[valid_mask], scores[valid_mask] - 
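-            # Remember the indices of the boxes that survive score filtering so
-            # that the indices returned by the NMS extension (computed on the
-            # filtered subset) can be mapped back to the original input below.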
valid_inds = torch.nonzero( - valid_mask, as_tuple=False).squeeze(dim=1) - - inds = ext_module.nms( - bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) - - if max_num > 0: - inds = inds[:max_num] - if is_filtering_by_score: - inds = valid_inds[inds] - return inds - - @staticmethod - def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold, - max_num): - from ..onnx import is_custom_op_loaded - has_custom_op = is_custom_op_loaded() - # TensorRT nms plugin is aligned with original nms in ONNXRuntime - is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT' - if has_custom_op and (not is_trt_backend): - return g.op( - 'mmcv::NonMaxSuppression', - bboxes, - scores, - iou_threshold_f=float(iou_threshold), - offset_i=int(offset)) - else: - from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze - from ..onnx.onnx_utils.symbolic_helper import _size_helper - - boxes = unsqueeze(g, bboxes, 0) - scores = unsqueeze(g, unsqueeze(g, scores, 0), 0) - - if max_num > 0: - max_num = g.op( - 'Constant', - value_t=torch.tensor(max_num, dtype=torch.long)) - else: - dim = g.op('Constant', value_t=torch.tensor(0)) - max_num = _size_helper(g, bboxes, dim) - max_output_per_class = max_num - iou_threshold = g.op( - 'Constant', - value_t=torch.tensor([iou_threshold], dtype=torch.float)) - score_threshold = g.op( - 'Constant', - value_t=torch.tensor([score_threshold], dtype=torch.float)) - nms_out = g.op('NonMaxSuppression', boxes, scores, - max_output_per_class, iou_threshold, - score_threshold) - return squeeze( - g, - select( - g, nms_out, 1, - g.op( - 'Constant', - value_t=torch.tensor([2], dtype=torch.long))), 1) - - -class SoftNMSop(torch.autograd.Function): - - @staticmethod - def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method, - offset): - dets = boxes.new_empty((boxes.size(0), 5), device='cpu') - inds = ext_module.softnms( - boxes.cpu(), - scores.cpu(), - dets.cpu(), - iou_threshold=float(iou_threshold), - sigma=float(sigma), - min_score=float(min_score), - method=int(method), - offset=int(offset)) - return dets, inds - - @staticmethod - def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method, - offset): - from packaging import version - assert version.parse(torch.__version__) >= version.parse('1.7.0') - nms_out = g.op( - 'mmcv::SoftNonMaxSuppression', - boxes, - scores, - iou_threshold_f=float(iou_threshold), - sigma_f=float(sigma), - min_score_f=float(min_score), - method_i=int(method), - offset_i=int(offset), - outputs=2) - return nms_out - - -@deprecated_api_warning({'iou_thr': 'iou_threshold'}) -def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0, max_num=-1): - """Dispatch to either CPU or GPU NMS implementations. - - The input can be either torch tensor or numpy array. GPU NMS will be used - if the input is gpu tensor, otherwise CPU NMS - will be used. The returned type will always be the same as inputs. - - Arguments: - boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). - scores (torch.Tensor or np.ndarray): scores in shape (N, ). - iou_threshold (float): IoU threshold for NMS. - offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). - score_threshold (float): score threshold for NMS. - max_num (int): maximum number of boxes after NMS. - - Returns: - tuple: kept dets(boxes and scores) and indice, which is always the \ - same data type as the input. 
- - Example: - >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9], - >>> [49.3, 32.9, 51.0, 35.3], - >>> [49.2, 31.8, 51.0, 35.4], - >>> [35.1, 11.5, 39.1, 15.7], - >>> [35.6, 11.8, 39.3, 14.2], - >>> [35.3, 11.5, 39.9, 14.5], - >>> [35.2, 11.7, 39.7, 15.7]], dtype=np.float32) - >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\ - dtype=np.float32) - >>> iou_threshold = 0.6 - >>> dets, inds = nms(boxes, scores, iou_threshold) - >>> assert len(inds) == len(dets) == 3 - """ - assert isinstance(boxes, (torch.Tensor, np.ndarray)) - assert isinstance(scores, (torch.Tensor, np.ndarray)) - is_numpy = False - if isinstance(boxes, np.ndarray): - is_numpy = True - boxes = torch.from_numpy(boxes) - if isinstance(scores, np.ndarray): - scores = torch.from_numpy(scores) - assert boxes.size(1) == 4 - assert boxes.size(0) == scores.size(0) - assert offset in (0, 1) - - if torch.__version__ == 'parrots': - indata_list = [boxes, scores] - indata_dict = { - 'iou_threshold': float(iou_threshold), - 'offset': int(offset) - } - inds = ext_module.nms(*indata_list, **indata_dict) - else: - inds = NMSop.apply(boxes, scores, iou_threshold, offset, - score_threshold, max_num) - dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1) - if is_numpy: - dets = dets.cpu().numpy() - inds = inds.cpu().numpy() - return dets, inds - - -@deprecated_api_warning({'iou_thr': 'iou_threshold'}) -def soft_nms(boxes, - scores, - iou_threshold=0.3, - sigma=0.5, - min_score=1e-3, - method='linear', - offset=0): - """Dispatch to only CPU Soft NMS implementations. - - The input can be either a torch tensor or numpy array. - The returned type will always be the same as inputs. - - Arguments: - boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). - scores (torch.Tensor or np.ndarray): scores in shape (N, ). - iou_threshold (float): IoU threshold for NMS. - sigma (float): hyperparameter for gaussian method - min_score (float): score filter threshold - method (str): either 'linear' or 'gaussian' - offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). - - Returns: - tuple: kept dets(boxes and scores) and indice, which is always the \ - same data type as the input. 
- - Example: - >>> boxes = np.array([[4., 3., 5., 3.], - >>> [4., 3., 5., 4.], - >>> [3., 1., 3., 1.], - >>> [3., 1., 3., 1.], - >>> [3., 1., 3., 1.], - >>> [3., 1., 3., 1.]], dtype=np.float32) - >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32) - >>> iou_threshold = 0.6 - >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5) - >>> assert len(inds) == len(dets) == 5 - """ - - assert isinstance(boxes, (torch.Tensor, np.ndarray)) - assert isinstance(scores, (torch.Tensor, np.ndarray)) - is_numpy = False - if isinstance(boxes, np.ndarray): - is_numpy = True - boxes = torch.from_numpy(boxes) - if isinstance(scores, np.ndarray): - scores = torch.from_numpy(scores) - assert boxes.size(1) == 4 - assert boxes.size(0) == scores.size(0) - assert offset in (0, 1) - method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2} - assert method in method_dict.keys() - - if torch.__version__ == 'parrots': - dets = boxes.new_empty((boxes.size(0), 5), device='cpu') - indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()] - indata_dict = { - 'iou_threshold': float(iou_threshold), - 'sigma': float(sigma), - 'min_score': min_score, - 'method': method_dict[method], - 'offset': int(offset) - } - inds = ext_module.softnms(*indata_list, **indata_dict) - else: - dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(), - float(iou_threshold), float(sigma), - float(min_score), method_dict[method], - int(offset)) - - dets = dets[:inds.size(0)] - - if is_numpy: - dets = dets.cpu().numpy() - inds = inds.cpu().numpy() - return dets, inds - else: - return dets.to(device=boxes.device), inds.to(device=boxes.device) - - -def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): - """Performs non-maximum suppression in a batched fashion. - - Modified from https://github.com/pytorch/vision/blob - /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. - In order to perform NMS independently per class, we add an offset to all - the boxes. The offset is dependent only on the class idx, and is large - enough so that boxes from different classes do not overlap. - - Arguments: - boxes (torch.Tensor): boxes in shape (N, 4). - scores (torch.Tensor): scores in shape (N, ). - idxs (torch.Tensor): each index value correspond to a bbox cluster, - and NMS will not be applied between elements of different idxs, - shape (N, ). - nms_cfg (dict): specify nms type and other parameters like iou_thr. - Possible keys includes the following. - - - iou_thr (float): IoU threshold used for NMS. - - split_thr (float): threshold number of boxes. In some cases the - number of boxes is large (e.g., 200k). To avoid OOM during - training, the users could set `split_thr` to a small value. - If the number of boxes is greater than the threshold, it will - perform NMS on each group of boxes separately and sequentially. - Defaults to 10000. - class_agnostic (bool): if true, nms is class agnostic, - i.e. IoU thresholding happens over all boxes, - regardless of the predicted class. - - Returns: - tuple: kept dets and indice. 
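-
-    Example (editor's illustrative sketch, not part of the original file;
-        assumes the compiled NMS extension is available):
-        >>> import torch
-        >>> boxes = torch.tensor([[0., 0., 10., 10.],
-        >>>                       [1., 1., 11., 11.],
-        >>>                       [20., 20., 30., 30.]])
-        >>> scores = torch.tensor([0.9, 0.8, 0.7])
-        >>> idxs = torch.tensor([0, 0, 1])  # per-box class index
-        >>> nms_cfg = dict(type='nms', iou_threshold=0.5)
-        >>> dets, keep = batched_nms(boxes, scores, idxs, nms_cfg)
-        >>> dets.shape[1]  # kept boxes with their scores appended
-        5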
- """ - nms_cfg_ = nms_cfg.copy() - class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) - if class_agnostic: - boxes_for_nms = boxes - else: - max_coordinate = boxes.max() - offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) - boxes_for_nms = boxes + offsets[:, None] - - nms_type = nms_cfg_.pop('type', 'nms') - nms_op = eval(nms_type) - - split_thr = nms_cfg_.pop('split_thr', 10000) - # Won't split to multiple nms nodes when exporting to onnx - if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export(): - dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_) - boxes = boxes[keep] - # -1 indexing works abnormal in TensorRT - # This assumes `dets` has 5 dimensions where - # the last dimension is score. - # TODO: more elegant way to handle the dimension issue. - # Some type of nms would reweight the score, such as SoftNMS - scores = dets[:, 4] - else: - max_num = nms_cfg_.pop('max_num', -1) - total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) - # Some type of nms would reweight the score, such as SoftNMS - scores_after_nms = scores.new_zeros(scores.size()) - for id in torch.unique(idxs): - mask = (idxs == id).nonzero(as_tuple=False).view(-1) - dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_) - total_mask[mask[keep]] = True - scores_after_nms[mask[keep]] = dets[:, -1] - keep = total_mask.nonzero(as_tuple=False).view(-1) - - scores, inds = scores_after_nms[keep].sort(descending=True) - keep = keep[inds] - boxes = boxes[keep] - - if max_num > 0: - keep = keep[:max_num] - boxes = boxes[:max_num] - scores = scores[:max_num] - - return torch.cat([boxes, scores[:, None]], -1), keep - - -def nms_match(dets, iou_threshold): - """Matched dets into different groups by NMS. - - NMS match is Similar to NMS but when a bbox is suppressed, nms match will - record the indice of suppressed bbox and form a group with the indice of - kept bbox. In each group, indice is sorted as score order. - - Arguments: - dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5). - iou_thr (float): IoU thresh for NMS. - - Returns: - List[torch.Tensor | np.ndarray]: The outer list corresponds different - matched group, the inner Tensor corresponds the indices for a group - in score order. - """ - if dets.shape[0] == 0: - matched = [] - else: - assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \ - f'but get {dets.shape}' - if isinstance(dets, torch.Tensor): - dets_t = dets.detach().cpu() - else: - dets_t = torch.from_numpy(dets) - indata_list = [dets_t] - indata_dict = {'iou_threshold': float(iou_threshold)} - matched = ext_module.nms_match(*indata_list, **indata_dict) - if torch.__version__ == 'parrots': - matched = matched.tolist() - - if isinstance(dets, torch.Tensor): - return [dets.new_tensor(m, dtype=torch.long) for m in matched] - else: - return [np.array(m, dtype=np.int) for m in matched] - - -def nms_rotated(dets, scores, iou_threshold, labels=None): - """Performs non-maximum suppression (NMS) on the rotated boxes according to - their intersection-over-union (IoU). - - Rotated NMS iteratively removes lower scoring rotated boxes which have an - IoU greater than iou_threshold with another (higher scoring) rotated box. - - Args: - boxes (Tensor): Rotated boxes in shape (N, 5). They are expected to \ - be in (x_ctr, y_ctr, width, height, angle_radian) format. - scores (Tensor): scores in shape (N, ). - iou_threshold (float): IoU thresh for NMS. - labels (Tensor): boxes' label in shape (N,). 
- - Returns: - tuple: kept dets(boxes and scores) and indice, which is always the \ - same data type as the input. - """ - if dets.shape[0] == 0: - return dets, None - multi_label = labels is not None - if multi_label: - dets_wl = torch.cat((dets, labels.unsqueeze(1)), 1) - else: - dets_wl = dets - _, order = scores.sort(0, descending=True) - dets_sorted = dets_wl.index_select(0, order) - - if torch.__version__ == 'parrots': - keep_inds = ext_module.nms_rotated( - dets_wl, - scores, - order, - dets_sorted, - iou_threshold=iou_threshold, - multi_label=multi_label) - else: - keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted, - iou_threshold, multi_label) - dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), - dim=1) - return dets, keep_inds diff --git a/ControlNet/annotator/uniformer/mmcv/ops/pixel_group.py b/ControlNet/annotator/uniformer/mmcv/ops/pixel_group.py deleted file mode 100644 index 2143c75f835a467c802fc3c37ecd3ac0f85bcda4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/pixel_group.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['pixel_group']) - - -def pixel_group(score, mask, embedding, kernel_label, kernel_contour, - kernel_region_num, distance_threshold): - """Group pixels into text instances, which is widely used text detection - methods. - - Arguments: - score (np.array or Tensor): The foreground score with size hxw. - mask (np.array or Tensor): The foreground mask with size hxw. - embedding (np.array or Tensor): The embedding with size hxwxc to - distinguish instances. - kernel_label (np.array or Tensor): The instance kernel index with - size hxw. - kernel_contour (np.array or Tensor): The kernel contour with size hxw. - kernel_region_num (int): The instance kernel region number. - distance_threshold (float): The embedding distance threshold between - kernel and pixel in one instance. - - Returns: - pixel_assignment (List[List[float]]): The instance coordinate list. - Each element consists of averaged confidence, pixel number, and - coordinates (x_i, y_i for all pixels) in order. 
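-
-    Example (editor's illustrative sketch, not part of the original file;
-        assumes the compiled ``pixel_group`` extension and uses random inputs,
-        so the resulting grouping itself is meaningless):
-        >>> import numpy as np
-        >>> h, w, c = 10, 10, 4
-        >>> score = np.random.rand(h, w).astype(np.float32)
-        >>> mask = score > 0.5
-        >>> embedding = np.random.rand(h, w, c).astype(np.float32)
-        >>> kernel_label = np.zeros((h, w), dtype=np.int32)
-        >>> kernel_contour = np.zeros((h, w), dtype=np.uint8)
-        >>> pixel_assignment = pixel_group(score, mask, embedding,
-        >>>                                kernel_label, kernel_contour,
-        >>>                                kernel_region_num=1,
-        >>>                                distance_threshold=0.8)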
- """ - assert isinstance(score, (torch.Tensor, np.ndarray)) - assert isinstance(mask, (torch.Tensor, np.ndarray)) - assert isinstance(embedding, (torch.Tensor, np.ndarray)) - assert isinstance(kernel_label, (torch.Tensor, np.ndarray)) - assert isinstance(kernel_contour, (torch.Tensor, np.ndarray)) - assert isinstance(kernel_region_num, int) - assert isinstance(distance_threshold, float) - - if isinstance(score, np.ndarray): - score = torch.from_numpy(score) - if isinstance(mask, np.ndarray): - mask = torch.from_numpy(mask) - if isinstance(embedding, np.ndarray): - embedding = torch.from_numpy(embedding) - if isinstance(kernel_label, np.ndarray): - kernel_label = torch.from_numpy(kernel_label) - if isinstance(kernel_contour, np.ndarray): - kernel_contour = torch.from_numpy(kernel_contour) - - if torch.__version__ == 'parrots': - label = ext_module.pixel_group( - score, - mask, - embedding, - kernel_label, - kernel_contour, - kernel_region_num=kernel_region_num, - distance_threshold=distance_threshold) - label = label.tolist() - label = label[0] - list_index = kernel_region_num - pixel_assignment = [] - for x in range(kernel_region_num): - pixel_assignment.append( - np.array( - label[list_index:list_index + int(label[x])], - dtype=np.float)) - list_index = list_index + int(label[x]) - else: - pixel_assignment = ext_module.pixel_group(score, mask, embedding, - kernel_label, kernel_contour, - kernel_region_num, - distance_threshold) - return pixel_assignment diff --git a/ControlNet/annotator/uniformer/mmcv/ops/point_sample.py b/ControlNet/annotator/uniformer/mmcv/ops/point_sample.py deleted file mode 100644 index 267f4b3c56630acd85f9bdc630b7be09abab0aba..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/point_sample.py +++ /dev/null @@ -1,336 +0,0 @@ -# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa - -from os import path as osp - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.modules.utils import _pair -from torch.onnx.operators import shape_as_tensor - - -def bilinear_grid_sample(im, grid, align_corners=False): - """Given an input and a flow-field grid, computes the output using input - values and pixel locations from grid. Supported only bilinear interpolation - method to sample the input pixels. - - Args: - im (torch.Tensor): Input feature map, shape (N, C, H, W) - grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2) - align_corners {bool}: If set to True, the extrema (-1 and 1) are - considered as referring to the center points of the input’s - corner pixels. If set to False, they are instead considered as - referring to the corner points of the input’s corner pixels, - making the sampling more resolution agnostic. 
- Returns: - torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg) - """ - n, c, h, w = im.shape - gn, gh, gw, _ = grid.shape - assert n == gn - - x = grid[:, :, :, 0] - y = grid[:, :, :, 1] - - if align_corners: - x = ((x + 1) / 2) * (w - 1) - y = ((y + 1) / 2) * (h - 1) - else: - x = ((x + 1) * w - 1) / 2 - y = ((y + 1) * h - 1) / 2 - - x = x.view(n, -1) - y = y.view(n, -1) - - x0 = torch.floor(x).long() - y0 = torch.floor(y).long() - x1 = x0 + 1 - y1 = y0 + 1 - - wa = ((x1 - x) * (y1 - y)).unsqueeze(1) - wb = ((x1 - x) * (y - y0)).unsqueeze(1) - wc = ((x - x0) * (y1 - y)).unsqueeze(1) - wd = ((x - x0) * (y - y0)).unsqueeze(1) - - # Apply default for grid_sample function zero padding - im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0) - padded_h = h + 2 - padded_w = w + 2 - # save points positions after padding - x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1 - - # Clip coordinates to padded image size - x0 = torch.where(x0 < 0, torch.tensor(0), x0) - x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0) - x1 = torch.where(x1 < 0, torch.tensor(0), x1) - x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1) - y0 = torch.where(y0 < 0, torch.tensor(0), y0) - y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0) - y1 = torch.where(y1 < 0, torch.tensor(0), y1) - y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1) - - im_padded = im_padded.view(n, c, -1) - - x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) - x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) - x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) - x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) - - Ia = torch.gather(im_padded, 2, x0_y0) - Ib = torch.gather(im_padded, 2, x0_y1) - Ic = torch.gather(im_padded, 2, x1_y0) - Id = torch.gather(im_padded, 2, x1_y1) - - return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw) - - -def is_in_onnx_export_without_custom_ops(): - from annotator.uniformer.mmcv.ops import get_onnxruntime_op_path - ort_custom_op_path = get_onnxruntime_op_path() - return torch.onnx.is_in_onnx_export( - ) and not osp.exists(ort_custom_op_path) - - -def normalize(grid): - """Normalize input grid from [-1, 1] to [0, 1] - Args: - grid (Tensor): The grid to be normalize, range [-1, 1]. - Returns: - Tensor: Normalized grid, range [0, 1]. - """ - - return (grid + 1.0) / 2.0 - - -def denormalize(grid): - """Denormalize input grid from range [0, 1] to [-1, 1] - Args: - grid (Tensor): The grid to be denormalize, range [0, 1]. - Returns: - Tensor: Denormalized grid, range [-1, 1]. - """ - - return grid * 2.0 - 1.0 - - -def generate_grid(num_grid, size, device): - """Generate regular square grid of points in [0, 1] x [0, 1] coordinate - space. - - Args: - num_grid (int): The number of grids to sample, one for each region. - size (tuple(int, int)): The side size of the regular grid. - device (torch.device): Desired device of returned tensor. - - Returns: - (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that - contains coordinates for the regular grids. - """ - - affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device) - grid = F.affine_grid( - affine_trans, torch.Size((1, 1, *size)), align_corners=False) - grid = normalize(grid) - return grid.view(1, -1, 2).expand(num_grid, -1, -1) - - -def rel_roi_point_to_abs_img_point(rois, rel_roi_points): - """Convert roi based relative point coordinates to image based absolute - point coordinates. 
- - Args: - rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) - rel_roi_points (Tensor): Point coordinates inside RoI, relative to - RoI, location, range (0, 1), shape (N, P, 2) - Returns: - Tensor: Image based absolute point coordinates, shape (N, P, 2) - """ - - with torch.no_grad(): - assert rel_roi_points.size(0) == rois.size(0) - assert rois.dim() == 2 - assert rel_roi_points.dim() == 3 - assert rel_roi_points.size(2) == 2 - # remove batch idx - if rois.size(1) == 5: - rois = rois[:, 1:] - abs_img_points = rel_roi_points.clone() - # To avoid an error during exporting to onnx use independent - # variables instead inplace computation - xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0]) - ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1]) - xs += rois[:, None, 0] - ys += rois[:, None, 1] - abs_img_points = torch.stack([xs, ys], dim=2) - return abs_img_points - - -def get_shape_from_feature_map(x): - """Get spatial resolution of input feature map considering exporting to - onnx mode. - - Args: - x (torch.Tensor): Input tensor, shape (N, C, H, W) - Returns: - torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2) - """ - if torch.onnx.is_in_onnx_export(): - img_shape = shape_as_tensor(x)[2:].flip(0).view(1, 1, 2).to( - x.device).float() - else: - img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1, 2).to( - x.device).float() - return img_shape - - -def abs_img_point_to_rel_img_point(abs_img_points, img, spatial_scale=1.): - """Convert image based absolute point coordinates to image based relative - coordinates for sampling. - - Args: - abs_img_points (Tensor): Image based absolute point coordinates, - shape (N, P, 2) - img (tuple/Tensor): (height, width) of image or feature map. - spatial_scale (float): Scale points by this factor. Default: 1. - - Returns: - Tensor: Image based relative point coordinates for sampling, - shape (N, P, 2) - """ - - assert (isinstance(img, tuple) and len(img) == 2) or \ - (isinstance(img, torch.Tensor) and len(img.shape) == 4) - - if isinstance(img, tuple): - h, w = img - scale = torch.tensor([w, h], - dtype=torch.float, - device=abs_img_points.device) - scale = scale.view(1, 1, 2) - else: - scale = get_shape_from_feature_map(img) - - return abs_img_points / scale * spatial_scale - - -def rel_roi_point_to_rel_img_point(rois, - rel_roi_points, - img, - spatial_scale=1.): - """Convert roi based relative point coordinates to image based absolute - point coordinates. - - Args: - rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) - rel_roi_points (Tensor): Point coordinates inside RoI, relative to - RoI, location, range (0, 1), shape (N, P, 2) - img (tuple/Tensor): (height, width) of image or feature map. - spatial_scale (float): Scale points by this factor. Default: 1. - - Returns: - Tensor: Image based relative point coordinates for sampling, - shape (N, P, 2) - """ - - abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points) - rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img, - spatial_scale) - - return rel_img_point - - -def point_sample(input, points, align_corners=False, **kwargs): - """A wrapper around :func:`grid_sample` to support 3D point_coords tensors - Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to - lie inside ``[0, 1] x [0, 1]`` square. - - Args: - input (Tensor): Feature map, shape (N, C, H, W). - points (Tensor): Image based absolute point coordinates (normalized), - range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). 
- align_corners (bool): Whether align_corners. Default: False - - Returns: - Tensor: Features of `point` on `input`, shape (N, C, P) or - (N, C, Hgrid, Wgrid). - """ - - add_dim = False - if points.dim() == 3: - add_dim = True - points = points.unsqueeze(2) - if is_in_onnx_export_without_custom_ops(): - # If custom ops for onnx runtime not compiled use python - # implementation of grid_sample function to make onnx graph - # with supported nodes - output = bilinear_grid_sample( - input, denormalize(points), align_corners=align_corners) - else: - output = F.grid_sample( - input, denormalize(points), align_corners=align_corners, **kwargs) - if add_dim: - output = output.squeeze(3) - return output - - -class SimpleRoIAlign(nn.Module): - - def __init__(self, output_size, spatial_scale, aligned=True): - """Simple RoI align in PointRend, faster than standard RoIAlign. - - Args: - output_size (tuple[int]): h, w - spatial_scale (float): scale the input boxes by this number - aligned (bool): if False, use the legacy implementation in - MMDetection, align_corners=True will be used in F.grid_sample. - If True, align the results more perfectly. - """ - - super(SimpleRoIAlign, self).__init__() - self.output_size = _pair(output_size) - self.spatial_scale = float(spatial_scale) - # to be consistent with other RoI ops - self.use_torchvision = False - self.aligned = aligned - - def forward(self, features, rois): - num_imgs = features.size(0) - num_rois = rois.size(0) - rel_roi_points = generate_grid( - num_rois, self.output_size, device=rois.device) - - if torch.onnx.is_in_onnx_export(): - rel_img_points = rel_roi_point_to_rel_img_point( - rois, rel_roi_points, features, self.spatial_scale) - rel_img_points = rel_img_points.reshape(num_imgs, -1, - *rel_img_points.shape[1:]) - point_feats = point_sample( - features, rel_img_points, align_corners=not self.aligned) - point_feats = point_feats.transpose(1, 2) - else: - point_feats = [] - for batch_ind in range(num_imgs): - # unravel batch dim - feat = features[batch_ind].unsqueeze(0) - inds = (rois[:, 0].long() == batch_ind) - if inds.any(): - rel_img_points = rel_roi_point_to_rel_img_point( - rois[inds], rel_roi_points[inds], feat, - self.spatial_scale).unsqueeze(0) - point_feat = point_sample( - feat, rel_img_points, align_corners=not self.aligned) - point_feat = point_feat.squeeze(0).transpose(0, 1) - point_feats.append(point_feat) - - point_feats = torch.cat(point_feats, dim=0) - - channels = features.size(1) - roi_feats = point_feats.reshape(num_rois, channels, *self.output_size) - - return roi_feats - - def __repr__(self): - format_str = self.__class__.__name__ - format_str += '(output_size={}, spatial_scale={}'.format( - self.output_size, self.spatial_scale) - return format_str diff --git a/ControlNet/annotator/uniformer/mmcv/ops/points_in_boxes.py b/ControlNet/annotator/uniformer/mmcv/ops/points_in_boxes.py deleted file mode 100644 index 4003173a53052161dbcd687a2fa1d755642fdab8..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/points_in_boxes.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward', - 'points_in_boxes_all_forward' -]) - - -def points_in_boxes_part(points, boxes): - """Find the box in which each point is (CUDA). 
- - Args: - points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate - boxes (torch.Tensor): [B, T, 7], - num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in - LiDAR/DEPTH coordinate, (x, y, z) is the bottom center - - Returns: - box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 - """ - assert points.shape[0] == boxes.shape[0], \ - 'Points and boxes should have the same batch size, ' \ - f'but got {points.shape[0]} and {boxes.shape[0]}' - assert boxes.shape[2] == 7, \ - 'boxes dimension should be 7, ' \ - f'but got unexpected shape {boxes.shape[2]}' - assert points.shape[2] == 3, \ - 'points dimension should be 3, ' \ - f'but got unexpected shape {points.shape[2]}' - batch_size, num_points, _ = points.shape - - box_idxs_of_pts = points.new_zeros((batch_size, num_points), - dtype=torch.int).fill_(-1) - - # If manually put the tensor 'points' or 'boxes' on a device - # which is not the current device, some temporary variables - # will be created on the current device in the cuda op, - # and the output will be incorrect. - # Therefore, we force the current device to be the same - # as the device of the tensors if it was not. - # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 - # for the incorrect output before the fix. - points_device = points.get_device() - assert points_device == boxes.get_device(), \ - 'Points and boxes should be put on the same device' - if torch.cuda.current_device() != points_device: - torch.cuda.set_device(points_device) - - ext_module.points_in_boxes_part_forward(boxes.contiguous(), - points.contiguous(), - box_idxs_of_pts) - - return box_idxs_of_pts - - -def points_in_boxes_cpu(points, boxes): - """Find all boxes in which each point is (CPU). The CPU version of - :meth:`points_in_boxes_all`. - - Args: - points (torch.Tensor): [B, M, 3], [x, y, z] in - LiDAR/DEPTH coordinate - boxes (torch.Tensor): [B, T, 7], - num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], - (x, y, z) is the bottom center. - - Returns: - box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. - """ - assert points.shape[0] == boxes.shape[0], \ - 'Points and boxes should have the same batch size, ' \ - f'but got {points.shape[0]} and {boxes.shape[0]}' - assert boxes.shape[2] == 7, \ - 'boxes dimension should be 7, ' \ - f'but got unexpected shape {boxes.shape[2]}' - assert points.shape[2] == 3, \ - 'points dimension should be 3, ' \ - f'but got unexpected shape {points.shape[2]}' - batch_size, num_points, _ = points.shape - num_boxes = boxes.shape[1] - - point_indices = points.new_zeros((batch_size, num_boxes, num_points), - dtype=torch.int) - for b in range(batch_size): - ext_module.points_in_boxes_cpu_forward(boxes[b].float().contiguous(), - points[b].float().contiguous(), - point_indices[b]) - point_indices = point_indices.transpose(1, 2) - - return point_indices - - -def points_in_boxes_all(points, boxes): - """Find all boxes in which each point is (CUDA). - - Args: - points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate - boxes (torch.Tensor): [B, T, 7], - num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], - (x, y, z) is the bottom center. - - Returns: - box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. 
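-
-    Example (editor's illustrative sketch, not part of the original file;
-        requires a CUDA device and the compiled extension):
-        >>> import torch
-        >>> points = torch.rand(1, 128, 3).cuda()
-        >>> # one unit box whose bottom center is at (0.5, 0.5, 0.0), yaw 0
-        >>> boxes = torch.tensor(
-        >>>     [[[0.5, 0.5, 0.0, 1.0, 1.0, 1.0, 0.0]]]).cuda()
-        >>> box_idxs_of_pts = points_in_boxes_all(points, boxes)
-        >>> box_idxs_of_pts.shape
-        torch.Size([1, 128, 1])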
- """ - assert boxes.shape[0] == points.shape[0], \ - 'Points and boxes should have the same batch size, ' \ - f'but got {boxes.shape[0]} and {boxes.shape[0]}' - assert boxes.shape[2] == 7, \ - 'boxes dimension should be 7, ' \ - f'but got unexpected shape {boxes.shape[2]}' - assert points.shape[2] == 3, \ - 'points dimension should be 3, ' \ - f'but got unexpected shape {points.shape[2]}' - batch_size, num_points, _ = points.shape - num_boxes = boxes.shape[1] - - box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes), - dtype=torch.int).fill_(0) - - # Same reason as line 25-32 - points_device = points.get_device() - assert points_device == boxes.get_device(), \ - 'Points and boxes should be put on the same device' - if torch.cuda.current_device() != points_device: - torch.cuda.set_device(points_device) - - ext_module.points_in_boxes_all_forward(boxes.contiguous(), - points.contiguous(), - box_idxs_of_pts) - - return box_idxs_of_pts diff --git a/ControlNet/annotator/uniformer/mmcv/ops/points_sampler.py b/ControlNet/annotator/uniformer/mmcv/ops/points_sampler.py deleted file mode 100644 index a802a74fd6c3610d9ae178e6201f47423eca7ad1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/points_sampler.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import List - -import torch -from torch import nn as nn - -from annotator.uniformer.mmcv.runner import force_fp32 -from .furthest_point_sample import (furthest_point_sample, - furthest_point_sample_with_dist) - - -def calc_square_dist(point_feat_a, point_feat_b, norm=True): - """Calculating square distance between a and b. - - Args: - point_feat_a (Tensor): (B, N, C) Feature vector of each point. - point_feat_b (Tensor): (B, M, C) Feature vector of each point. - norm (Bool, optional): Whether to normalize the distance. - Default: True. - - Returns: - Tensor: (B, N, M) Distance between each pair points. - """ - num_channel = point_feat_a.shape[-1] - # [bs, n, 1] - a_square = torch.sum(point_feat_a.unsqueeze(dim=2).pow(2), dim=-1) - # [bs, 1, m] - b_square = torch.sum(point_feat_b.unsqueeze(dim=1).pow(2), dim=-1) - - corr_matrix = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2)) - - dist = a_square + b_square - 2 * corr_matrix - if norm: - dist = torch.sqrt(dist) / num_channel - return dist - - -def get_sampler_cls(sampler_type): - """Get the type and mode of points sampler. - - Args: - sampler_type (str): The type of points sampler. - The valid value are "D-FPS", "F-FPS", or "FS". - - Returns: - class: Points sampler type. - """ - sampler_mappings = { - 'D-FPS': DFPSSampler, - 'F-FPS': FFPSSampler, - 'FS': FSSampler, - } - try: - return sampler_mappings[sampler_type] - except KeyError: - raise KeyError( - f'Supported `sampler_type` are {sampler_mappings.keys()}, but got \ - {sampler_type}') - - -class PointsSampler(nn.Module): - """Points sampling. - - Args: - num_point (list[int]): Number of sample points. - fps_mod_list (list[str], optional): Type of FPS method, valid mod - ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. - F-FPS: using feature distances for FPS. - D-FPS: using Euclidean distances of points for FPS. - FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int], optional): - Range of points to apply FPS. Default: [-1]. 
- """ - - def __init__(self, - num_point: List[int], - fps_mod_list: List[str] = ['D-FPS'], - fps_sample_range_list: List[int] = [-1]): - super().__init__() - # FPS would be applied to different fps_mod in the list, - # so the length of the num_point should be equal to - # fps_mod_list and fps_sample_range_list. - assert len(num_point) == len(fps_mod_list) == len( - fps_sample_range_list) - self.num_point = num_point - self.fps_sample_range_list = fps_sample_range_list - self.samplers = nn.ModuleList() - for fps_mod in fps_mod_list: - self.samplers.append(get_sampler_cls(fps_mod)()) - self.fp16_enabled = False - - @force_fp32() - def forward(self, points_xyz, features): - """ - Args: - points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor): (B, C, N) Descriptors of the features. - - Returns: - Tensor: (B, npoint, sample_num) Indices of sampled points. - """ - indices = [] - last_fps_end_index = 0 - - for fps_sample_range, sampler, npoint in zip( - self.fps_sample_range_list, self.samplers, self.num_point): - assert fps_sample_range < points_xyz.shape[1] - - if fps_sample_range == -1: - sample_points_xyz = points_xyz[:, last_fps_end_index:] - if features is not None: - sample_features = features[:, :, last_fps_end_index:] - else: - sample_features = None - else: - sample_points_xyz = \ - points_xyz[:, last_fps_end_index:fps_sample_range] - if features is not None: - sample_features = features[:, :, last_fps_end_index: - fps_sample_range] - else: - sample_features = None - - fps_idx = sampler(sample_points_xyz.contiguous(), sample_features, - npoint) - - indices.append(fps_idx + last_fps_end_index) - last_fps_end_index += fps_sample_range - indices = torch.cat(indices, dim=1) - - return indices - - -class DFPSSampler(nn.Module): - """Using Euclidean distances of points for FPS.""" - - def __init__(self): - super().__init__() - - def forward(self, points, features, npoint): - """Sampling points with D-FPS.""" - fps_idx = furthest_point_sample(points.contiguous(), npoint) - return fps_idx - - -class FFPSSampler(nn.Module): - """Using feature distances for FPS.""" - - def __init__(self): - super().__init__() - - def forward(self, points, features, npoint): - """Sampling points with F-FPS.""" - assert features is not None, \ - 'feature input to FFPS_Sampler should not be None' - features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2) - features_dist = calc_square_dist( - features_for_fps, features_for_fps, norm=False) - fps_idx = furthest_point_sample_with_dist(features_dist, npoint) - return fps_idx - - -class FSSampler(nn.Module): - """Using F-FPS and D-FPS simultaneously.""" - - def __init__(self): - super().__init__() - - def forward(self, points, features, npoint): - """Sampling points with FS_Sampling.""" - assert features is not None, \ - 'feature input to FS_Sampler should not be None' - ffps_sampler = FFPSSampler() - dfps_sampler = DFPSSampler() - fps_idx_ffps = ffps_sampler(points, features, npoint) - fps_idx_dfps = dfps_sampler(points, features, npoint) - fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1) - return fps_idx diff --git a/ControlNet/annotator/uniformer/mmcv/ops/psa_mask.py b/ControlNet/annotator/uniformer/mmcv/ops/psa_mask.py deleted file mode 100644 index cdf14e62b50e8d4dd6856c94333c703bcc4c9ab6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/psa_mask.py +++ /dev/null @@ -1,92 +0,0 @@ -# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa -from torch import nn -from 
torch.autograd import Function -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', - ['psamask_forward', 'psamask_backward']) - - -class PSAMaskFunction(Function): - - @staticmethod - def symbolic(g, input, psa_type, mask_size): - return g.op( - 'mmcv::MMCVPSAMask', - input, - psa_type_i=psa_type, - mask_size_i=mask_size) - - @staticmethod - def forward(ctx, input, psa_type, mask_size): - ctx.psa_type = psa_type - ctx.mask_size = _pair(mask_size) - ctx.save_for_backward(input) - - h_mask, w_mask = ctx.mask_size - batch_size, channels, h_feature, w_feature = input.size() - assert channels == h_mask * w_mask - output = input.new_zeros( - (batch_size, h_feature * w_feature, h_feature, w_feature)) - - ext_module.psamask_forward( - input, - output, - psa_type=psa_type, - num_=batch_size, - h_feature=h_feature, - w_feature=w_feature, - h_mask=h_mask, - w_mask=w_mask, - half_h_mask=(h_mask - 1) // 2, - half_w_mask=(w_mask - 1) // 2) - return output - - @staticmethod - def backward(ctx, grad_output): - input = ctx.saved_tensors[0] - psa_type = ctx.psa_type - h_mask, w_mask = ctx.mask_size - batch_size, channels, h_feature, w_feature = input.size() - grad_input = grad_output.new_zeros( - (batch_size, channels, h_feature, w_feature)) - ext_module.psamask_backward( - grad_output, - grad_input, - psa_type=psa_type, - num_=batch_size, - h_feature=h_feature, - w_feature=w_feature, - h_mask=h_mask, - w_mask=w_mask, - half_h_mask=(h_mask - 1) // 2, - half_w_mask=(w_mask - 1) // 2) - return grad_input, None, None, None - - -psa_mask = PSAMaskFunction.apply - - -class PSAMask(nn.Module): - - def __init__(self, psa_type, mask_size=None): - super(PSAMask, self).__init__() - assert psa_type in ['collect', 'distribute'] - if psa_type == 'collect': - psa_type_enum = 0 - else: - psa_type_enum = 1 - self.psa_type_enum = psa_type_enum - self.mask_size = mask_size - self.psa_type = psa_type - - def forward(self, input): - return psa_mask(input, self.psa_type_enum, self.mask_size) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(psa_type={self.psa_type}, ' - s += f'mask_size={self.mask_size})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/roi_align.py b/ControlNet/annotator/uniformer/mmcv/ops/roi_align.py deleted file mode 100644 index 0755aefc66e67233ceae0f4b77948301c443e9fb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/roi_align.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import deprecated_api_warning, ext_loader - -ext_module = ext_loader.load_ext('_ext', - ['roi_align_forward', 'roi_align_backward']) - - -class RoIAlignFunction(Function): - - @staticmethod - def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio, - pool_mode, aligned): - from ..onnx import is_custom_op_loaded - has_custom_op = is_custom_op_loaded() - if has_custom_op: - return g.op( - 'mmcv::MMCVRoiAlign', - input, - rois, - output_height_i=output_size[0], - output_width_i=output_size[1], - spatial_scale_f=spatial_scale, - sampling_ratio_i=sampling_ratio, - mode_s=pool_mode, - aligned_i=aligned) - else: - from torch.onnx.symbolic_opset9 import sub, squeeze - from torch.onnx.symbolic_helper import _slice_helper - from torch.onnx import TensorProtoDataType - # batch_indices = rois[:, 0].long() - batch_indices = _slice_helper( - g, rois, axes=[1], starts=[0], ends=[1]) - batch_indices = squeeze(g, batch_indices, 1) - batch_indices = g.op( - 'Cast', batch_indices, to_i=TensorProtoDataType.INT64) - # rois = rois[:, 1:] - rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5]) - if aligned: - # rois -= 0.5/spatial_scale - aligned_offset = g.op( - 'Constant', - value_t=torch.tensor([0.5 / spatial_scale], - dtype=torch.float32)) - rois = sub(g, rois, aligned_offset) - # roi align - return g.op( - 'RoiAlign', - input, - rois, - batch_indices, - output_height_i=output_size[0], - output_width_i=output_size[1], - spatial_scale_f=spatial_scale, - sampling_ratio_i=max(0, sampling_ratio), - mode_s=pool_mode) - - @staticmethod - def forward(ctx, - input, - rois, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - pool_mode='avg', - aligned=True): - ctx.output_size = _pair(output_size) - ctx.spatial_scale = spatial_scale - ctx.sampling_ratio = sampling_ratio - assert pool_mode in ('max', 'avg') - ctx.pool_mode = 0 if pool_mode == 'max' else 1 - ctx.aligned = aligned - ctx.input_shape = input.size() - - assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' - - output_shape = (rois.size(0), input.size(1), ctx.output_size[0], - ctx.output_size[1]) - output = input.new_zeros(output_shape) - if ctx.pool_mode == 0: - argmax_y = input.new_zeros(output_shape) - argmax_x = input.new_zeros(output_shape) - else: - argmax_y = input.new_zeros(0) - argmax_x = input.new_zeros(0) - - ext_module.roi_align_forward( - input, - rois, - output, - argmax_y, - argmax_x, - aligned_height=ctx.output_size[0], - aligned_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - pool_mode=ctx.pool_mode, - aligned=ctx.aligned) - - ctx.save_for_backward(rois, argmax_y, argmax_x) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - rois, argmax_y, argmax_x = ctx.saved_tensors - grad_input = grad_output.new_zeros(ctx.input_shape) - # complex head architecture may cause grad_output uncontiguous. 
- grad_output = grad_output.contiguous() - ext_module.roi_align_backward( - grad_output, - rois, - argmax_y, - argmax_x, - grad_input, - aligned_height=ctx.output_size[0], - aligned_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - pool_mode=ctx.pool_mode, - aligned=ctx.aligned) - return grad_input, None, None, None, None, None, None - - -roi_align = RoIAlignFunction.apply - - -class RoIAlign(nn.Module): - """RoI align pooling layer. - - Args: - output_size (tuple): h, w - spatial_scale (float): scale the input boxes by this number - sampling_ratio (int): number of inputs samples to take for each - output sample. 0 to take samples densely for current models. - pool_mode (str, 'avg' or 'max'): pooling mode in each bin. - aligned (bool): if False, use the legacy implementation in - MMDetection. If True, align the results more perfectly. - use_torchvision (bool): whether to use roi_align from torchvision. - - Note: - The implementation of RoIAlign when aligned=True is modified from - https://github.com/facebookresearch/detectron2/ - - The meaning of aligned=True: - - Given a continuous coordinate c, its two neighboring pixel - indices (in our pixel model) are computed by floor(c - 0.5) and - ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete - indices [0] and [1] (which are sampled from the underlying signal - at continuous coordinates 0.5 and 1.5). But the original roi_align - (aligned=False) does not subtract the 0.5 when computing - neighboring pixel indices and therefore it uses pixels with a - slightly incorrect alignment (relative to our pixel model) when - performing bilinear interpolation. - - With `aligned=True`, - we first appropriately scale the ROI and then shift it by -0.5 - prior to calling roi_align. This produces the correct neighbors; - - The difference does not make a difference to the model's - performance if ROIAlign is used together with conv layers. - """ - - @deprecated_api_warning( - { - 'out_size': 'output_size', - 'sample_num': 'sampling_ratio' - }, - cls_name='RoIAlign') - def __init__(self, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - pool_mode='avg', - aligned=True, - use_torchvision=False): - super(RoIAlign, self).__init__() - - self.output_size = _pair(output_size) - self.spatial_scale = float(spatial_scale) - self.sampling_ratio = int(sampling_ratio) - self.pool_mode = pool_mode - self.aligned = aligned - self.use_torchvision = use_torchvision - - def forward(self, input, rois): - """ - Args: - input: NCHW images - rois: Bx5 boxes. First column is the index into N.\ - The other 4 columns are xyxy. - """ - if self.use_torchvision: - from torchvision.ops import roi_align as tv_roi_align - if 'aligned' in tv_roi_align.__code__.co_varnames: - return tv_roi_align(input, rois, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.aligned) - else: - if self.aligned: - rois -= rois.new_tensor([0.] 
+ - [0.5 / self.spatial_scale] * 4) - return tv_roi_align(input, rois, self.output_size, - self.spatial_scale, self.sampling_ratio) - else: - return roi_align(input, rois, self.output_size, self.spatial_scale, - self.sampling_ratio, self.pool_mode, self.aligned) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(output_size={self.output_size}, ' - s += f'spatial_scale={self.spatial_scale}, ' - s += f'sampling_ratio={self.sampling_ratio}, ' - s += f'pool_mode={self.pool_mode}, ' - s += f'aligned={self.aligned}, ' - s += f'use_torchvision={self.use_torchvision})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/roi_align_rotated.py b/ControlNet/annotator/uniformer/mmcv/ops/roi_align_rotated.py deleted file mode 100644 index 0ce4961a3555d4da8bc3e32f1f7d5ad50036587d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/roi_align_rotated.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward']) - - -class RoIAlignRotatedFunction(Function): - - @staticmethod - def symbolic(g, features, rois, out_size, spatial_scale, sample_num, - aligned, clockwise): - if isinstance(out_size, int): - out_h = out_size - out_w = out_size - elif isinstance(out_size, tuple): - assert len(out_size) == 2 - assert isinstance(out_size[0], int) - assert isinstance(out_size[1], int) - out_h, out_w = out_size - else: - raise TypeError( - '"out_size" must be an integer or tuple of integers') - return g.op( - 'mmcv::MMCVRoIAlignRotated', - features, - rois, - output_height_i=out_h, - output_width_i=out_h, - spatial_scale_f=spatial_scale, - sampling_ratio_i=sample_num, - aligned_i=aligned, - clockwise_i=clockwise) - - @staticmethod - def forward(ctx, - features, - rois, - out_size, - spatial_scale, - sample_num=0, - aligned=True, - clockwise=False): - if isinstance(out_size, int): - out_h = out_size - out_w = out_size - elif isinstance(out_size, tuple): - assert len(out_size) == 2 - assert isinstance(out_size[0], int) - assert isinstance(out_size[1], int) - out_h, out_w = out_size - else: - raise TypeError( - '"out_size" must be an integer or tuple of integers') - ctx.spatial_scale = spatial_scale - ctx.sample_num = sample_num - ctx.aligned = aligned - ctx.clockwise = clockwise - ctx.save_for_backward(rois) - ctx.feature_size = features.size() - - batch_size, num_channels, data_height, data_width = features.size() - num_rois = rois.size(0) - - output = features.new_zeros(num_rois, num_channels, out_h, out_w) - ext_module.roi_align_rotated_forward( - features, - rois, - output, - pooled_height=out_h, - pooled_width=out_w, - spatial_scale=spatial_scale, - sample_num=sample_num, - aligned=aligned, - clockwise=clockwise) - return output - - @staticmethod - def backward(ctx, grad_output): - feature_size = ctx.feature_size - spatial_scale = ctx.spatial_scale - aligned = ctx.aligned - clockwise = ctx.clockwise - sample_num = ctx.sample_num - rois = ctx.saved_tensors[0] - assert feature_size is not None - batch_size, num_channels, data_height, data_width = feature_size - - out_w = grad_output.size(3) - out_h = grad_output.size(2) - - grad_input = grad_rois = None - - if ctx.needs_input_grad[0]: - grad_input = rois.new_zeros(batch_size, num_channels, data_height, - data_width) - ext_module.roi_align_rotated_backward( - grad_output.contiguous(), - 
rois, - grad_input, - pooled_height=out_h, - pooled_width=out_w, - spatial_scale=spatial_scale, - sample_num=sample_num, - aligned=aligned, - clockwise=clockwise) - return grad_input, grad_rois, None, None, None, None, None - - -roi_align_rotated = RoIAlignRotatedFunction.apply - - -class RoIAlignRotated(nn.Module): - """RoI align pooling layer for rotated proposals. - - It accepts a feature map of shape (N, C, H, W) and rois with shape - (n, 6) with each roi decoded as (batch_index, center_x, center_y, - w, h, angle). The angle is in radian. - - Args: - out_size (tuple): h, w - spatial_scale (float): scale the input boxes by this number - sample_num (int): number of inputs samples to take for each - output sample. 0 to take samples densely for current models. - aligned (bool): if False, use the legacy implementation in - MMDetection. If True, align the results more perfectly. - Default: True. - clockwise (bool): If True, the angle in each proposal follows a - clockwise fashion in image space, otherwise, the angle is - counterclockwise. Default: False. - - Note: - The implementation of RoIAlign when aligned=True is modified from - https://github.com/facebookresearch/detectron2/ - - The meaning of aligned=True: - - Given a continuous coordinate c, its two neighboring pixel - indices (in our pixel model) are computed by floor(c - 0.5) and - ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete - indices [0] and [1] (which are sampled from the underlying signal - at continuous coordinates 0.5 and 1.5). But the original roi_align - (aligned=False) does not subtract the 0.5 when computing - neighboring pixel indices and therefore it uses pixels with a - slightly incorrect alignment (relative to our pixel model) when - performing bilinear interpolation. - - With `aligned=True`, - we first appropriately scale the ROI and then shift it by -0.5 - prior to calling roi_align. This produces the correct neighbors; - - The difference does not make a difference to the model's - performance if ROIAlign is used together with conv layers. - """ - - def __init__(self, - out_size, - spatial_scale, - sample_num=0, - aligned=True, - clockwise=False): - super(RoIAlignRotated, self).__init__() - - self.out_size = out_size - self.spatial_scale = float(spatial_scale) - self.sample_num = int(sample_num) - self.aligned = aligned - self.clockwise = clockwise - - def forward(self, features, rois): - return RoIAlignRotatedFunction.apply(features, rois, self.out_size, - self.spatial_scale, - self.sample_num, self.aligned, - self.clockwise) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/roi_pool.py b/ControlNet/annotator/uniformer/mmcv/ops/roi_pool.py deleted file mode 100644 index d339d8f2941eabc1cbe181a9c6c5ab5ff4ff4e5f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/roi_pool.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', - ['roi_pool_forward', 'roi_pool_backward']) - - -class RoIPoolFunction(Function): - - @staticmethod - def symbolic(g, input, rois, output_size, spatial_scale): - return g.op( - 'MaxRoiPool', - input, - rois, - pooled_shape_i=output_size, - spatial_scale_f=spatial_scale) - - @staticmethod - def forward(ctx, input, rois, output_size, spatial_scale=1.0): - ctx.output_size = _pair(output_size) - ctx.spatial_scale = spatial_scale - ctx.input_shape = input.size() - - assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' - - output_shape = (rois.size(0), input.size(1), ctx.output_size[0], - ctx.output_size[1]) - output = input.new_zeros(output_shape) - argmax = input.new_zeros(output_shape, dtype=torch.int) - - ext_module.roi_pool_forward( - input, - rois, - output, - argmax, - pooled_height=ctx.output_size[0], - pooled_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale) - - ctx.save_for_backward(rois, argmax) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - rois, argmax = ctx.saved_tensors - grad_input = grad_output.new_zeros(ctx.input_shape) - - ext_module.roi_pool_backward( - grad_output, - rois, - argmax, - grad_input, - pooled_height=ctx.output_size[0], - pooled_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale) - - return grad_input, None, None, None - - -roi_pool = RoIPoolFunction.apply - - -class RoIPool(nn.Module): - - def __init__(self, output_size, spatial_scale=1.0): - super(RoIPool, self).__init__() - - self.output_size = _pair(output_size) - self.spatial_scale = float(spatial_scale) - - def forward(self, input, rois): - return roi_pool(input, rois, self.output_size, self.spatial_scale) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(output_size={self.output_size}, ' - s += f'spatial_scale={self.spatial_scale})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/roiaware_pool3d.py b/ControlNet/annotator/uniformer/mmcv/ops/roiaware_pool3d.py deleted file mode 100644 index 291b0e5a9b692492c7d7e495ea639c46042e2f18..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/roiaware_pool3d.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn -from torch.autograd import Function - -import annotator.uniformer.mmcv as mmcv -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['roiaware_pool3d_forward', 'roiaware_pool3d_backward']) - - -class RoIAwarePool3d(nn.Module): - """Encode the geometry-specific features of each 3D proposal. - - Please refer to `PartA2 `_ for more - details. - - Args: - out_size (int or tuple): The size of output features. n or - [n1, n2, n3]. - max_pts_per_voxel (int, optional): The maximum number of points per - voxel. Default: 128. - mode (str, optional): Pooling method of RoIAware, 'max' or 'avg'. - Default: 'max'. 
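-
-    Example (editor's illustrative sketch, not part of the original file;
-        requires a CUDA device and the compiled op, and all values are
-        arbitrary):
-        >>> import torch
-        >>> pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=64, mode='max')
-        >>> rois = torch.tensor([[0., 0., 0., 2., 2., 2., 0.]]).cuda()  # (N, 7)
-        >>> pts = torch.rand(256, 3).cuda()
-        >>> pts_feature = torch.rand(256, 16).cuda()
-        >>> pooled = pool(rois, pts, pts_feature)
-        >>> pooled.shape
-        torch.Size([1, 4, 4, 4, 16])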
- """ - - def __init__(self, out_size, max_pts_per_voxel=128, mode='max'): - super().__init__() - - self.out_size = out_size - self.max_pts_per_voxel = max_pts_per_voxel - assert mode in ['max', 'avg'] - pool_mapping = {'max': 0, 'avg': 1} - self.mode = pool_mapping[mode] - - def forward(self, rois, pts, pts_feature): - """ - Args: - rois (torch.Tensor): [N, 7], in LiDAR coordinate, - (x, y, z) is the bottom center of rois. - pts (torch.Tensor): [npoints, 3], coordinates of input points. - pts_feature (torch.Tensor): [npoints, C], features of input points. - - Returns: - pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C] - """ - - return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, - self.out_size, - self.max_pts_per_voxel, self.mode) - - -class RoIAwarePool3dFunction(Function): - - @staticmethod - def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel, - mode): - """ - Args: - rois (torch.Tensor): [N, 7], in LiDAR coordinate, - (x, y, z) is the bottom center of rois. - pts (torch.Tensor): [npoints, 3], coordinates of input points. - pts_feature (torch.Tensor): [npoints, C], features of input points. - out_size (int or tuple): The size of output features. n or - [n1, n2, n3]. - max_pts_per_voxel (int): The maximum number of points per voxel. - Default: 128. - mode (int): Pooling method of RoIAware, 0 (max pool) or 1 (average - pool). - - Returns: - pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C], output - pooled features. - """ - - if isinstance(out_size, int): - out_x = out_y = out_z = out_size - else: - assert len(out_size) == 3 - assert mmcv.is_tuple_of(out_size, int) - out_x, out_y, out_z = out_size - - num_rois = rois.shape[0] - num_channels = pts_feature.shape[-1] - num_pts = pts.shape[0] - - pooled_features = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels)) - argmax = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int) - pts_idx_of_voxels = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, max_pts_per_voxel), - dtype=torch.int) - - ext_module.roiaware_pool3d_forward(rois, pts, pts_feature, argmax, - pts_idx_of_voxels, pooled_features, - mode) - - ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode, - num_pts, num_channels) - return pooled_features - - @staticmethod - def backward(ctx, grad_out): - ret = ctx.roiaware_pool3d_for_backward - pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret - - grad_in = grad_out.new_zeros((num_pts, num_channels)) - ext_module.roiaware_pool3d_backward(pts_idx_of_voxels, argmax, - grad_out.contiguous(), grad_in, - mode) - - return None, None, grad_in, None, None, None diff --git a/ControlNet/annotator/uniformer/mmcv/ops/roipoint_pool3d.py b/ControlNet/annotator/uniformer/mmcv/ops/roipoint_pool3d.py deleted file mode 100644 index 0a21412c0728431c04b84245bc2e3109eea9aefc..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/roipoint_pool3d.py +++ /dev/null @@ -1,77 +0,0 @@ -from torch import nn as nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['roipoint_pool3d_forward']) - - -class RoIPointPool3d(nn.Module): - """Encode the geometry-specific features of each 3D proposal. - - Please refer to `Paper of PartA2 `_ - for more details. - - Args: - num_sampled_points (int, optional): Number of samples in each roi. - Default: 512. 
- """ - - def __init__(self, num_sampled_points=512): - super().__init__() - self.num_sampled_points = num_sampled_points - - def forward(self, points, point_features, boxes3d): - """ - Args: - points (torch.Tensor): Input points whose shape is (B, N, C). - point_features (torch.Tensor): Features of input points whose shape - is (B, N, C). - boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). - - Returns: - pooled_features (torch.Tensor): The output pooled features whose - shape is (B, M, 512, 3 + C). - pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). - """ - return RoIPointPool3dFunction.apply(points, point_features, boxes3d, - self.num_sampled_points) - - -class RoIPointPool3dFunction(Function): - - @staticmethod - def forward(ctx, points, point_features, boxes3d, num_sampled_points=512): - """ - Args: - points (torch.Tensor): Input points whose shape is (B, N, C). - point_features (torch.Tensor): Features of input points whose shape - is (B, N, C). - boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). - num_sampled_points (int, optional): The num of sampled points. - Default: 512. - - Returns: - pooled_features (torch.Tensor): The output pooled features whose - shape is (B, M, 512, 3 + C). - pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). - """ - assert len(points.shape) == 3 and points.shape[2] == 3 - batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[ - 1], point_features.shape[2] - pooled_boxes3d = boxes3d.view(batch_size, -1, 7) - pooled_features = point_features.new_zeros( - (batch_size, boxes_num, num_sampled_points, 3 + feature_len)) - pooled_empty_flag = point_features.new_zeros( - (batch_size, boxes_num)).int() - - ext_module.roipoint_pool3d_forward(points.contiguous(), - pooled_boxes3d.contiguous(), - point_features.contiguous(), - pooled_features, pooled_empty_flag) - - return pooled_features, pooled_empty_flag - - @staticmethod - def backward(ctx, grad_out): - raise NotImplementedError diff --git a/ControlNet/annotator/uniformer/mmcv/ops/saconv.py b/ControlNet/annotator/uniformer/mmcv/ops/saconv.py deleted file mode 100644 index b4ee3978e097fca422805db4e31ae481006d7971..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/saconv.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F - -from annotator.uniformer.mmcv.cnn import CONV_LAYERS, ConvAWS2d, constant_init -from annotator.uniformer.mmcv.ops.deform_conv import deform_conv2d -from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version - - -@CONV_LAYERS.register_module(name='SAC') -class SAConv2d(ConvAWS2d): - """SAC (Switchable Atrous Convolution) - - This is an implementation of SAC in DetectoRS - (https://arxiv.org/pdf/2006.02334.pdf). - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution. Default: 1 - padding (int or tuple, optional): Zero-padding added to both sides of - the input. Default: 0 - padding_mode (string, optional): ``'zeros'``, ``'reflect'``, - ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` - dilation (int or tuple, optional): Spacing between kernel elements. - Default: 1 - groups (int, optional): Number of blocked connections from input - channels to output channels. 
Default: 1 - bias (bool, optional): If ``True``, adds a learnable bias to the - output. Default: ``True`` - use_deform: If ``True``, replace convolution with deformable - convolution. Default: ``False``. - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - use_deform=False): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.use_deform = use_deform - self.switch = nn.Conv2d( - self.in_channels, 1, kernel_size=1, stride=stride, bias=True) - self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size())) - self.pre_context = nn.Conv2d( - self.in_channels, self.in_channels, kernel_size=1, bias=True) - self.post_context = nn.Conv2d( - self.out_channels, self.out_channels, kernel_size=1, bias=True) - if self.use_deform: - self.offset_s = nn.Conv2d( - self.in_channels, - 18, - kernel_size=3, - padding=1, - stride=stride, - bias=True) - self.offset_l = nn.Conv2d( - self.in_channels, - 18, - kernel_size=3, - padding=1, - stride=stride, - bias=True) - self.init_weights() - - def init_weights(self): - constant_init(self.switch, 0, bias=1) - self.weight_diff.data.zero_() - constant_init(self.pre_context, 0) - constant_init(self.post_context, 0) - if self.use_deform: - constant_init(self.offset_s, 0) - constant_init(self.offset_l, 0) - - def forward(self, x): - # pre-context - avg_x = F.adaptive_avg_pool2d(x, output_size=1) - avg_x = self.pre_context(avg_x) - avg_x = avg_x.expand_as(x) - x = x + avg_x - # switch - avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect') - avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0) - switch = self.switch(avg_x) - # sac - weight = self._get_weight(self.weight) - zero_bias = torch.zeros( - self.out_channels, device=weight.device, dtype=weight.dtype) - - if self.use_deform: - offset = self.offset_s(avg_x) - out_s = deform_conv2d(x, offset, weight, self.stride, self.padding, - self.dilation, self.groups, 1) - else: - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.5.0')): - out_s = super().conv2d_forward(x, weight) - elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): - # bias is a required argument of _conv_forward in torch 1.8.0 - out_s = super()._conv_forward(x, weight, zero_bias) - else: - out_s = super()._conv_forward(x, weight) - ori_p = self.padding - ori_d = self.dilation - self.padding = tuple(3 * p for p in self.padding) - self.dilation = tuple(3 * d for d in self.dilation) - weight = weight + self.weight_diff - if self.use_deform: - offset = self.offset_l(avg_x) - out_l = deform_conv2d(x, offset, weight, self.stride, self.padding, - self.dilation, self.groups, 1) - else: - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.5.0')): - out_l = super().conv2d_forward(x, weight) - elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): - # bias is a required argument of _conv_forward in torch 1.8.0 - out_l = super()._conv_forward(x, weight, zero_bias) - else: - out_l = super()._conv_forward(x, weight) - - out = switch * out_s + (1 - switch) * out_l - self.padding = ori_p - self.dilation = ori_d - # post-context - avg_x = F.adaptive_avg_pool2d(out, output_size=1) - avg_x = self.post_context(avg_x) - avg_x = avg_x.expand_as(out) - out = out + avg_x - return out diff --git a/ControlNet/annotator/uniformer/mmcv/ops/scatter_points.py 
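# A pure-PyTorch illustration of the switchable-atrous combination performed by
# SAConv2d.forward() above: the same kernel is applied at dilation 1 and, with
# the learned residual `weight_diff`, at dilation 3, and the two outputs are
# blended per-pixel by the `switch` map. This is only a sketch of that blending
# step (no pre/post context, no weight standardisation, no deformable conv), so
# it runs on CPU without the compiled extension; all tensors are stand-ins.
import torch
import torch.nn.functional as F

x = torch.randn(1, 8, 32, 32)
weight = torch.randn(8, 8, 3, 3) * 0.1
weight_diff = torch.zeros_like(weight)       # starts at zero, as in init_weights()
switch = torch.rand(1, 1, 32, 32)            # stand-in for self.switch(avg_x)

out_s = F.conv2d(x, weight, padding=1, dilation=1)
out_l = F.conv2d(x, weight + weight_diff, padding=3, dilation=3)
out = switch * out_s + (1 - switch) * out_l  # same combination as in forward()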
b/ControlNet/annotator/uniformer/mmcv/ops/scatter_points.py deleted file mode 100644 index 2b8aa4169e9f6ca4a6f845ce17d6d1e4db416bb8..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/scatter_points.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', - ['dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward']) - - -class _DynamicScatter(Function): - - @staticmethod - def forward(ctx, feats, coors, reduce_type='max'): - """convert kitti points(N, >=3) to voxels. - - Args: - feats (torch.Tensor): [N, C]. Points features to be reduced - into voxels. - coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates - (specifically multi-dim voxel index) of each points. - reduce_type (str, optional): Reduce op. support 'max', 'sum' and - 'mean'. Default: 'max'. - - Returns: - voxel_feats (torch.Tensor): [M, C]. Reduced features, input - features that shares the same voxel coordinates are reduced to - one row. - voxel_coors (torch.Tensor): [M, ndim]. Voxel coordinates. - """ - results = ext_module.dynamic_point_to_voxel_forward( - feats, coors, reduce_type) - (voxel_feats, voxel_coors, point2voxel_map, - voxel_points_count) = results - ctx.reduce_type = reduce_type - ctx.save_for_backward(feats, voxel_feats, point2voxel_map, - voxel_points_count) - ctx.mark_non_differentiable(voxel_coors) - return voxel_feats, voxel_coors - - @staticmethod - def backward(ctx, grad_voxel_feats, grad_voxel_coors=None): - (feats, voxel_feats, point2voxel_map, - voxel_points_count) = ctx.saved_tensors - grad_feats = torch.zeros_like(feats) - # TODO: whether to use index put or use cuda_backward - # To use index put, need point to voxel index - ext_module.dynamic_point_to_voxel_backward( - grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats, - point2voxel_map, voxel_points_count, ctx.reduce_type) - return grad_feats, None, None - - -dynamic_scatter = _DynamicScatter.apply - - -class DynamicScatter(nn.Module): - """Scatters points into voxels, used in the voxel encoder with dynamic - voxelization. - - Note: - The CPU and GPU implementation get the same output, but have numerical - difference after summation and division (e.g., 5e-7). - - Args: - voxel_size (list): list [x, y, z] size of three dimension. - point_cloud_range (list): The coordinate range of points, [x_min, - y_min, z_min, x_max, y_max, z_max]. - average_points (bool): whether to use avg pooling to scatter points - into voxel. - """ - - def __init__(self, voxel_size, point_cloud_range, average_points: bool): - super().__init__() - - self.voxel_size = voxel_size - self.point_cloud_range = point_cloud_range - self.average_points = average_points - - def forward_single(self, points, coors): - """Scatters points into voxels. - - Args: - points (torch.Tensor): Points to be reduced into voxels. - coors (torch.Tensor): Corresponding voxel coordinates (specifically - multi-dim voxel index) of each points. - - Returns: - voxel_feats (torch.Tensor): Reduced features, input features that - shares the same voxel coordinates are reduced to one row. - voxel_coors (torch.Tensor): Voxel coordinates. - """ - reduce = 'mean' if self.average_points else 'max' - return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce) - - def forward(self, points, coors): - """Scatters points/features into voxels. 
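# A CPU reference for what `dynamic_scatter` above computes: point features that
# share a voxel coordinate are reduced into a single row. The real CUDA kernel
# also returns point-to-voxel mappings used by the backward pass and may order
# voxels differently; `scatter_reduce` needs PyTorch >= 1.12. Illustration only.
import torch

def dynamic_scatter_reference(feats, coors, reduce='max'):
    voxel_coors, inverse = torch.unique(coors, return_inverse=True, dim=0)
    num_voxels, num_feats = voxel_coors.size(0), feats.size(1)
    if reduce == 'mean':
        voxel_feats = torch.zeros(num_voxels, num_feats)
        voxel_feats.index_add_(0, inverse, feats)
        counts = torch.bincount(inverse, minlength=num_voxels).clamp(min=1)
        voxel_feats = voxel_feats / counts.unsqueeze(1).float()
    else:  # 'max'
        voxel_feats = torch.full((num_voxels, num_feats), float('-inf'))
        voxel_feats = voxel_feats.scatter_reduce(
            0, inverse.unsqueeze(1).expand_as(feats), feats, reduce='amax')
    return voxel_feats, voxel_coors

feats = torch.rand(100, 4)
coors = torch.randint(0, 5, (100, 3))
voxel_feats, voxel_coors = dynamic_scatter_reference(feats, coors, 'mean')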
- - Args: - points (torch.Tensor): Points to be reduced into voxels. - coors (torch.Tensor): Corresponding voxel coordinates (specifically - multi-dim voxel index) of each points. - - Returns: - voxel_feats (torch.Tensor): Reduced features, input features that - shares the same voxel coordinates are reduced to one row. - voxel_coors (torch.Tensor): Voxel coordinates. - """ - if coors.size(-1) == 3: - return self.forward_single(points, coors) - else: - batch_size = coors[-1, 0] + 1 - voxels, voxel_coors = [], [] - for i in range(batch_size): - inds = torch.where(coors[:, 0] == i) - voxel, voxel_coor = self.forward_single( - points[inds], coors[inds][:, 1:]) - coor_pad = nn.functional.pad( - voxel_coor, (1, 0), mode='constant', value=i) - voxel_coors.append(coor_pad) - voxels.append(voxel) - features = torch.cat(voxels, dim=0) - feature_coors = torch.cat(voxel_coors, dim=0) - - return features, feature_coors - - def __repr__(self): - s = self.__class__.__name__ + '(' - s += 'voxel_size=' + str(self.voxel_size) - s += ', point_cloud_range=' + str(self.point_cloud_range) - s += ', average_points=' + str(self.average_points) - s += ')' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/sync_bn.py b/ControlNet/annotator/uniformer/mmcv/ops/sync_bn.py deleted file mode 100644 index c9b016fcbe860989c56cd1040034bcfa60e146d2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/sync_bn.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.distributed as dist -import torch.nn.functional as F -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.module import Module -from torch.nn.parameter import Parameter - -from annotator.uniformer.mmcv.cnn import NORM_LAYERS -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', [ - 'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output', - 'sync_bn_backward_param', 'sync_bn_backward_data' -]) - - -class SyncBatchNormFunction(Function): - - @staticmethod - def symbolic(g, input, running_mean, running_var, weight, bias, momentum, - eps, group, group_size, stats_mode): - return g.op( - 'mmcv::MMCVSyncBatchNorm', - input, - running_mean, - running_var, - weight, - bias, - momentum_f=momentum, - eps_f=eps, - group_i=group, - group_size_i=group_size, - stats_mode=stats_mode) - - @staticmethod - def forward(self, input, running_mean, running_var, weight, bias, momentum, - eps, group, group_size, stats_mode): - self.momentum = momentum - self.eps = eps - self.group = group - self.group_size = group_size - self.stats_mode = stats_mode - - assert isinstance( - input, (torch.HalfTensor, torch.FloatTensor, - torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \ - f'only support Half or Float Tensor, but {input.type()}' - output = torch.zeros_like(input) - input3d = input.flatten(start_dim=2) - output3d = output.view_as(input3d) - num_channels = input3d.size(1) - - # ensure mean/var/norm/std are initialized as zeros - # ``torch.empty()`` does not guarantee that - mean = torch.zeros( - num_channels, dtype=torch.float, device=input3d.device) - var = torch.zeros( - num_channels, dtype=torch.float, device=input3d.device) - norm = torch.zeros_like( - input3d, dtype=torch.float, device=input3d.device) - std = torch.zeros( - num_channels, dtype=torch.float, device=input3d.device) - - batch_size = input3d.size(0) - if batch_size > 0: - ext_module.sync_bn_forward_mean(input3d, mean) - 
batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype) - else: - # skip updating mean and leave it as zeros when the input is empty - batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype) - - # synchronize mean and the batch flag - vec = torch.cat([mean, batch_flag]) - if self.stats_mode == 'N': - vec *= batch_size - if self.group_size > 1: - dist.all_reduce(vec, group=self.group) - total_batch = vec[-1].detach() - mean = vec[:num_channels] - - if self.stats_mode == 'default': - mean = mean / self.group_size - elif self.stats_mode == 'N': - mean = mean / total_batch.clamp(min=1) - else: - raise NotImplementedError - - # leave var as zeros when the input is empty - if batch_size > 0: - ext_module.sync_bn_forward_var(input3d, mean, var) - - if self.stats_mode == 'N': - var *= batch_size - if self.group_size > 1: - dist.all_reduce(var, group=self.group) - - if self.stats_mode == 'default': - var /= self.group_size - elif self.stats_mode == 'N': - var /= total_batch.clamp(min=1) - else: - raise NotImplementedError - - # if the total batch size over all the ranks is zero, - # we should not update the statistics in the current batch - update_flag = total_batch.clamp(max=1) - momentum = update_flag * self.momentum - ext_module.sync_bn_forward_output( - input3d, - mean, - var, - weight, - bias, - running_mean, - running_var, - norm, - std, - output3d, - eps=self.eps, - momentum=momentum, - group_size=self.group_size) - self.save_for_backward(norm, std, weight) - return output - - @staticmethod - @once_differentiable - def backward(self, grad_output): - norm, std, weight = self.saved_tensors - grad_weight = torch.zeros_like(weight) - grad_bias = torch.zeros_like(weight) - grad_input = torch.zeros_like(grad_output) - grad_output3d = grad_output.flatten(start_dim=2) - grad_input3d = grad_input.view_as(grad_output3d) - - batch_size = grad_input3d.size(0) - if batch_size > 0: - ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight, - grad_bias) - - # all reduce - if self.group_size > 1: - dist.all_reduce(grad_weight, group=self.group) - dist.all_reduce(grad_bias, group=self.group) - grad_weight /= self.group_size - grad_bias /= self.group_size - - if batch_size > 0: - ext_module.sync_bn_backward_data(grad_output3d, weight, - grad_weight, grad_bias, norm, std, - grad_input3d) - - return grad_input, None, None, grad_weight, grad_bias, \ - None, None, None, None, None - - -@NORM_LAYERS.register_module(name='MMSyncBN') -class SyncBatchNorm(Module): - """Synchronized Batch Normalization. - - Args: - num_features (int): number of features/chennels in input tensor - eps (float, optional): a value added to the denominator for numerical - stability. Defaults to 1e-5. - momentum (float, optional): the value used for the running_mean and - running_var computation. Defaults to 0.1. - affine (bool, optional): whether to use learnable affine parameters. - Defaults to True. - track_running_stats (bool, optional): whether to track the running - mean and variance during training. When set to False, this - module does not track such statistics, and initializes statistics - buffers ``running_mean`` and ``running_var`` as ``None``. When - these buffers are ``None``, this module always uses batch - statistics in both training and eval modes. Defaults to True. - group (int, optional): synchronization of stats happen within - each process group individually. By default it is synchronization - across the whole world. Defaults to None. - stats_mode (str, optional): The statistical mode. 
Available options - includes ``'default'`` and ``'N'``. Defaults to 'default'. - When ``stats_mode=='default'``, it computes the overall statistics - using those from each worker with equal weight, i.e., the - statistics are synchronized and simply divied by ``group``. This - mode will produce inaccurate statistics when empty tensors occur. - When ``stats_mode=='N'``, it compute the overall statistics using - the total number of batches in each worker ignoring the number of - group, i.e., the statistics are synchronized and then divied by - the total batch ``N``. This mode is beneficial when empty tensors - occur during training, as it average the total mean by the real - number of batch. - """ - - def __init__(self, - num_features, - eps=1e-5, - momentum=0.1, - affine=True, - track_running_stats=True, - group=None, - stats_mode='default'): - super(SyncBatchNorm, self).__init__() - self.num_features = num_features - self.eps = eps - self.momentum = momentum - self.affine = affine - self.track_running_stats = track_running_stats - group = dist.group.WORLD if group is None else group - self.group = group - self.group_size = dist.get_world_size(group) - assert stats_mode in ['default', 'N'], \ - f'"stats_mode" only accepts "default" and "N", got "{stats_mode}"' - self.stats_mode = stats_mode - if self.affine: - self.weight = Parameter(torch.Tensor(num_features)) - self.bias = Parameter(torch.Tensor(num_features)) - else: - self.register_parameter('weight', None) - self.register_parameter('bias', None) - if self.track_running_stats: - self.register_buffer('running_mean', torch.zeros(num_features)) - self.register_buffer('running_var', torch.ones(num_features)) - self.register_buffer('num_batches_tracked', - torch.tensor(0, dtype=torch.long)) - else: - self.register_buffer('running_mean', None) - self.register_buffer('running_var', None) - self.register_buffer('num_batches_tracked', None) - self.reset_parameters() - - def reset_running_stats(self): - if self.track_running_stats: - self.running_mean.zero_() - self.running_var.fill_(1) - self.num_batches_tracked.zero_() - - def reset_parameters(self): - self.reset_running_stats() - if self.affine: - self.weight.data.uniform_() # pytorch use ones_() - self.bias.data.zero_() - - def forward(self, input): - if input.dim() < 2: - raise ValueError( - f'expected at least 2D input, got {input.dim()}D input') - if self.momentum is None: - exponential_average_factor = 0.0 - else: - exponential_average_factor = self.momentum - - if self.training and self.track_running_stats: - if self.num_batches_tracked is not None: - self.num_batches_tracked += 1 - if self.momentum is None: # use cumulative moving average - exponential_average_factor = 1.0 / float( - self.num_batches_tracked) - else: # use exponential moving average - exponential_average_factor = self.momentum - - if self.training or not self.track_running_stats: - return SyncBatchNormFunction.apply( - input, self.running_mean, self.running_var, self.weight, - self.bias, exponential_average_factor, self.eps, self.group, - self.group_size, self.stats_mode) - else: - return F.batch_norm(input, self.running_mean, self.running_var, - self.weight, self.bias, False, - exponential_average_factor, self.eps) - - def __repr__(self): - s = self.__class__.__name__ - s += f'({self.num_features}, ' - s += f'eps={self.eps}, ' - s += f'momentum={self.momentum}, ' - s += f'affine={self.affine}, ' - s += f'track_running_stats={self.track_running_stats}, ' - s += f'group_size={self.group_size},' - s += 
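# A tiny numeric illustration of the two `stats_mode` options documented above,
# using made-up per-worker statistics. With 'default' every worker contributes
# with equal weight, so a worker that saw an empty batch (whose local mean is
# left at zero) drags the global mean down; with 'N' each worker is weighted by
# its batch size, i.e. the mean is averaged over the real number of samples.
worker_means = [2.0, 4.0, 0.0]   # third worker received an empty batch
batch_sizes = [8, 8, 0]

mean_default = sum(worker_means) / len(worker_means)            # 2.0 (biased)
mean_n = (sum(m * n for m, n in zip(worker_means, batch_sizes))
          / max(sum(batch_sizes), 1))                           # 3.0 (unbiased)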
f'stats_mode={self.stats_mode})' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/ops/three_interpolate.py b/ControlNet/annotator/uniformer/mmcv/ops/three_interpolate.py deleted file mode 100644 index 203f47f05d58087e034fb3cd8cd6a09233947b4a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/three_interpolate.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import Tuple - -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['three_interpolate_forward', 'three_interpolate_backward']) - - -class ThreeInterpolate(Function): - """Performs weighted linear interpolation on 3 features. - - Please refer to `Paper of PointNet++ `_ - for more details. - """ - - @staticmethod - def forward(ctx, features: torch.Tensor, indices: torch.Tensor, - weight: torch.Tensor) -> torch.Tensor: - """ - Args: - features (Tensor): (B, C, M) Features descriptors to be - interpolated - indices (Tensor): (B, n, 3) index three nearest neighbors - of the target features in features - weight (Tensor): (B, n, 3) weights of interpolation - - Returns: - Tensor: (B, C, N) tensor of the interpolated features - """ - assert features.is_contiguous() - assert indices.is_contiguous() - assert weight.is_contiguous() - - B, c, m = features.size() - n = indices.size(1) - ctx.three_interpolate_for_backward = (indices, weight, m) - output = torch.cuda.FloatTensor(B, c, n) - - ext_module.three_interpolate_forward( - features, indices, weight, output, b=B, c=c, m=m, n=n) - return output - - @staticmethod - def backward( - ctx, grad_out: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Args: - grad_out (Tensor): (B, C, N) tensor with gradients of outputs - - Returns: - Tensor: (B, C, M) tensor with gradients of features - """ - idx, weight, m = ctx.three_interpolate_for_backward - B, c, n = grad_out.size() - - grad_features = torch.cuda.FloatTensor(B, c, m).zero_() - grad_out_data = grad_out.data.contiguous() - - ext_module.three_interpolate_backward( - grad_out_data, idx, weight, grad_features.data, b=B, c=c, n=n, m=m) - return grad_features, None, None - - -three_interpolate = ThreeInterpolate.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/three_nn.py b/ControlNet/annotator/uniformer/mmcv/ops/three_nn.py deleted file mode 100644 index 2b01047a129989cd5545a0a86f23a487f4a13ce1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/three_nn.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import Tuple - -import torch -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['three_nn_forward']) - - -class ThreeNN(Function): - """Find the top-3 nearest neighbors of the target set from the source set. - - Please refer to `Paper of PointNet++ `_ - for more details. - """ - - @staticmethod - def forward(ctx, target: torch.Tensor, - source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Args: - target (Tensor): shape (B, N, 3), points set that needs to - find the nearest neighbors. - source (Tensor): shape (B, M, 3), points set that is used - to find the nearest neighbors of points in target set. - - Returns: - Tensor: shape (B, N, 3), L2 distance of each point in target - set to their corresponding nearest neighbors. 
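# A CPU reference for the interpolation that ThreeInterpolate's CUDA kernel
# performs above: output[b, c, n] = sum_k weight[b, n, k] * features[b, c, idx[b, n, k]].
# Shapes follow the docstring; this sketch only makes the op's semantics
# concrete and is not the production implementation.
import torch

def three_interpolate_reference(features, indices, weight):
    B, C, M = features.shape
    n = indices.size(1)
    idx = indices.long().unsqueeze(1).expand(B, C, n, 3)          # (B, C, n, 3)
    gathered = torch.gather(
        features.unsqueeze(2).expand(B, C, n, M), 3, idx)         # (B, C, n, 3)
    return (gathered * weight.unsqueeze(1)).sum(-1)               # (B, C, n)

features = torch.rand(2, 16, 128)                 # (B, C, M)
indices = torch.randint(0, 128, (2, 512, 3))      # (B, n, 3)
weight = torch.rand(2, 512, 3)
weight = weight / weight.sum(-1, keepdim=True)    # interpolation weights sum to 1
out = three_interpolate_reference(features, indices, weight)      # (2, 16, 512)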
- """ - target = target.contiguous() - source = source.contiguous() - - B, N, _ = target.size() - m = source.size(1) - dist2 = torch.cuda.FloatTensor(B, N, 3) - idx = torch.cuda.IntTensor(B, N, 3) - - ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m) - if torch.__version__ != 'parrots': - ctx.mark_non_differentiable(idx) - - return torch.sqrt(dist2), idx - - @staticmethod - def backward(ctx, a=None, b=None): - return None, None - - -three_nn = ThreeNN.apply diff --git a/ControlNet/annotator/uniformer/mmcv/ops/tin_shift.py b/ControlNet/annotator/uniformer/mmcv/ops/tin_shift.py deleted file mode 100644 index 472c9fcfe45a124e819b7ed5653e585f94a8811e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/tin_shift.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# Code reference from "Temporal Interlacing Network" -# https://github.com/deepcs233/TIN/blob/master/cuda_shift/rtc_wrap.py -# Hao Shao, Shengju Qian, Yu Liu -# shaoh19@mails.tsinghua.edu.cn, sjqian@cse.cuhk.edu.hk, yuliu@ee.cuhk.edu.hk - -import torch -import torch.nn as nn -from torch.autograd import Function - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', - ['tin_shift_forward', 'tin_shift_backward']) - - -class TINShiftFunction(Function): - - @staticmethod - def forward(ctx, input, shift): - C = input.size(2) - num_segments = shift.size(1) - if C // num_segments <= 0 or C % num_segments != 0: - raise ValueError('C should be a multiple of num_segments, ' - f'but got C={C} and num_segments={num_segments}.') - - ctx.save_for_backward(shift) - - out = torch.zeros_like(input) - ext_module.tin_shift_forward(input, shift, out) - - return out - - @staticmethod - def backward(ctx, grad_output): - - shift = ctx.saved_tensors[0] - data_grad_input = grad_output.new(*grad_output.size()).zero_() - shift_grad_input = shift.new(*shift.size()).zero_() - ext_module.tin_shift_backward(grad_output, shift, data_grad_input) - - return data_grad_input, shift_grad_input - - -tin_shift = TINShiftFunction.apply - - -class TINShift(nn.Module): - """Temporal Interlace Shift. - - Temporal Interlace shift is a differentiable temporal-wise frame shifting - which is proposed in "Temporal Interlacing Network" - - Please refer to https://arxiv.org/abs/2001.06499 for more details. - Code is modified from https://github.com/mit-han-lab/temporal-shift-module - """ - - def forward(self, input, shift): - """Perform temporal interlace shift. - - Args: - input (Tensor): Feature map with shape [N, num_segments, C, H * W]. - shift (Tensor): Shift tensor with shape [N, num_segments]. - - Returns: - Feature map after temporal interlace shift. - """ - return tin_shift(input, shift) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/upfirdn2d.py b/ControlNet/annotator/uniformer/mmcv/ops/upfirdn2d.py deleted file mode 100644 index c8bb2c3c949eed38a6465ed369fa881538dca010..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/upfirdn2d.py +++ /dev/null @@ -1,330 +0,0 @@ -# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py # noqa:E501 - -# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. -# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator -# Augmentation (ADA) -# ======================================================================= - -# 1. Definitions - -# "Licensor" means any person or entity that distributes its Work. 
- -# "Software" means the original work of authorship made available under -# this License. - -# "Work" means the Software and any additions to or derivative works of -# the Software that are made available under this License. - -# The terms "reproduce," "reproduction," "derivative works," and -# "distribution" have the meaning as provided under U.S. copyright law; -# provided, however, that for the purposes of this License, derivative -# works shall not include works that remain separable from, or merely -# link (or bind by name) to the interfaces of, the Work. - -# Works, including the Software, are "made available" under this License -# by including in or with the Work either (a) a copyright notice -# referencing the applicability of this License to the Work, or (b) a -# copy of this License. - -# 2. License Grants - -# 2.1 Copyright Grant. Subject to the terms and conditions of this -# License, each Licensor grants to you a perpetual, worldwide, -# non-exclusive, royalty-free, copyright license to reproduce, -# prepare derivative works of, publicly display, publicly perform, -# sublicense and distribute its Work and any resulting derivative -# works in any form. - -# 3. Limitations - -# 3.1 Redistribution. You may reproduce or distribute the Work only -# if (a) you do so under this License, (b) you include a complete -# copy of this License with your distribution, and (c) you retain -# without modification any copyright, patent, trademark, or -# attribution notices that are present in the Work. - -# 3.2 Derivative Works. You may specify that additional or different -# terms apply to the use, reproduction, and distribution of your -# derivative works of the Work ("Your Terms") only if (a) Your Terms -# provide that the use limitation in Section 3.3 applies to your -# derivative works, and (b) you identify the specific derivative -# works that are subject to Your Terms. Notwithstanding Your Terms, -# this License (including the redistribution requirements in Section -# 3.1) will continue to apply to the Work itself. - -# 3.3 Use Limitation. The Work and any derivative works thereof only -# may be used or intended for use non-commercially. Notwithstanding -# the foregoing, NVIDIA and its affiliates may use the Work and any -# derivative works commercially. As used herein, "non-commercially" -# means for research or evaluation purposes only. - -# 3.4 Patent Claims. If you bring or threaten to bring a patent claim -# against any Licensor (including any claim, cross-claim or -# counterclaim in a lawsuit) to enforce any patents that you allege -# are infringed by any Work, then your rights under this License from -# such Licensor (including the grant in Section 2.1) will terminate -# immediately. - -# 3.5 Trademarks. This License does not grant any rights to use any -# Licensor’s or its affiliates’ names, logos, or trademarks, except -# as necessary to reproduce the notices described in this License. - -# 3.6 Termination. If you violate any term of this License, then your -# rights under this License (including the grant in Section 2.1) will -# terminate immediately. - -# 4. Disclaimer of Warranty. - -# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR -# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER -# THIS LICENSE. - -# 5. Limitation of Liability. 
- -# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL -# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE -# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, -# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF -# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK -# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, -# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER -# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGES. - -# ======================================================================= - -import torch -from torch.autograd import Function -from torch.nn import functional as F - -from annotator.uniformer.mmcv.utils import to_2tuple -from ..utils import ext_loader - -upfirdn2d_ext = ext_loader.load_ext('_ext', ['upfirdn2d']) - - -class UpFirDn2dBackward(Function): - - @staticmethod - def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, - in_size, out_size): - - up_x, up_y = up - down_x, down_y = down - g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad - - grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) - - grad_input = upfirdn2d_ext.upfirdn2d( - grad_output, - grad_kernel, - up_x=down_x, - up_y=down_y, - down_x=up_x, - down_y=up_y, - pad_x0=g_pad_x0, - pad_x1=g_pad_x1, - pad_y0=g_pad_y0, - pad_y1=g_pad_y1) - grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], - in_size[3]) - - ctx.save_for_backward(kernel) - - pad_x0, pad_x1, pad_y0, pad_y1 = pad - - ctx.up_x = up_x - ctx.up_y = up_y - ctx.down_x = down_x - ctx.down_y = down_y - ctx.pad_x0 = pad_x0 - ctx.pad_x1 = pad_x1 - ctx.pad_y0 = pad_y0 - ctx.pad_y1 = pad_y1 - ctx.in_size = in_size - ctx.out_size = out_size - - return grad_input - - @staticmethod - def backward(ctx, gradgrad_input): - kernel, = ctx.saved_tensors - - gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], - ctx.in_size[3], 1) - - gradgrad_out = upfirdn2d_ext.upfirdn2d( - gradgrad_input, - kernel, - up_x=ctx.up_x, - up_y=ctx.up_y, - down_x=ctx.down_x, - down_y=ctx.down_y, - pad_x0=ctx.pad_x0, - pad_x1=ctx.pad_x1, - pad_y0=ctx.pad_y0, - pad_y1=ctx.pad_y1) - # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], - # ctx.out_size[1], ctx.in_size[3]) - gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.in_size[1], - ctx.out_size[0], ctx.out_size[1]) - - return gradgrad_out, None, None, None, None, None, None, None, None - - -class UpFirDn2d(Function): - - @staticmethod - def forward(ctx, input, kernel, up, down, pad): - up_x, up_y = up - down_x, down_y = down - pad_x0, pad_x1, pad_y0, pad_y1 = pad - - kernel_h, kernel_w = kernel.shape - batch, channel, in_h, in_w = input.shape - ctx.in_size = input.shape - - input = input.reshape(-1, in_h, in_w, 1) - - ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1])) - - out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 - out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 - ctx.out_size = (out_h, out_w) - - ctx.up = (up_x, up_y) - ctx.down = (down_x, down_y) - ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1) - - g_pad_x0 = kernel_w - pad_x0 - 1 - g_pad_y0 = kernel_h - pad_y0 - 1 - g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1 - g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1 - - ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1) - - out = upfirdn2d_ext.upfirdn2d( - input, - kernel, - up_x=up_x, - up_y=up_y, - 
down_x=down_x, - down_y=down_y, - pad_x0=pad_x0, - pad_x1=pad_x1, - pad_y0=pad_y0, - pad_y1=pad_y1) - # out = out.view(major, out_h, out_w, minor) - out = out.view(-1, channel, out_h, out_w) - - return out - - @staticmethod - def backward(ctx, grad_output): - kernel, grad_kernel = ctx.saved_tensors - - grad_input = UpFirDn2dBackward.apply( - grad_output, - kernel, - grad_kernel, - ctx.up, - ctx.down, - ctx.pad, - ctx.g_pad, - ctx.in_size, - ctx.out_size, - ) - - return grad_input, None, None, None, None - - -def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): - """UpFRIDn for 2d features. - - UpFIRDn is short for upsample, apply FIR filter and downsample. More - details can be found in: - https://www.mathworks.com/help/signal/ref/upfirdn.html - - Args: - input (Tensor): Tensor with shape of (n, c, h, w). - kernel (Tensor): Filter kernel. - up (int | tuple[int], optional): Upsampling factor. If given a number, - we will use this factor for the both height and width side. - Defaults to 1. - down (int | tuple[int], optional): Downsampling factor. If given a - number, we will use this factor for the both height and width side. - Defaults to 1. - pad (tuple[int], optional): Padding for tensors, (x_pad, y_pad) or - (x_pad_0, x_pad_1, y_pad_0, y_pad_1). Defaults to (0, 0). - - Returns: - Tensor: Tensor after UpFIRDn. - """ - if input.device.type == 'cpu': - if len(pad) == 2: - pad = (pad[0], pad[1], pad[0], pad[1]) - - up = to_2tuple(up) - - down = to_2tuple(down) - - out = upfirdn2d_native(input, kernel, up[0], up[1], down[0], down[1], - pad[0], pad[1], pad[2], pad[3]) - else: - _up = to_2tuple(up) - - _down = to_2tuple(down) - - if len(pad) == 4: - _pad = pad - elif len(pad) == 2: - _pad = (pad[0], pad[1], pad[0], pad[1]) - - out = UpFirDn2d.apply(input, kernel, _up, _down, _pad) - - return out - - -def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, - pad_y0, pad_y1): - _, channel, in_h, in_w = input.shape - input = input.reshape(-1, in_h, in_w, 1) - - _, in_h, in_w, minor = input.shape - kernel_h, kernel_w = kernel.shape - - out = input.view(-1, in_h, 1, in_w, 1, minor) - out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) - out = out.view(-1, in_h * up_y, in_w * up_x, minor) - - out = F.pad( - out, - [0, 0, - max(pad_x0, 0), - max(pad_x1, 0), - max(pad_y0, 0), - max(pad_y1, 0)]) - out = out[:, - max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), - max(-pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ] - - out = out.permute(0, 3, 1, 2) - out = out.reshape( - [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) - w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) - out = F.conv2d(out, w) - out = out.reshape( - -1, - minor, - in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, - in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, - ) - out = out.permute(0, 2, 3, 1) - out = out[:, ::down_y, ::down_x, :] - - out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 - out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 - - return out.view(-1, channel, out_h, out_w) diff --git a/ControlNet/annotator/uniformer/mmcv/ops/voxelize.py b/ControlNet/annotator/uniformer/mmcv/ops/voxelize.py deleted file mode 100644 index ca3226a4fbcbfe58490fa2ea8e1c16b531214121..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/ops/voxelize.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
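# Usage sketch for the `upfirdn2d` wrapper above. On CPU tensors it falls
# through to the pure-PyTorch `upfirdn2d_native`, so this particular example
# needs no compiled extension; the 2x-upsample-with-blur parameters are only
# illustrative and the import path mirrors the removed in-tree module.
import torch
from annotator.uniformer.mmcv.ops.upfirdn2d import upfirdn2d

x = torch.randn(1, 3, 16, 16)                 # (n, c, h, w) on CPU
k = torch.tensor([1., 3., 3., 1.])
kernel = torch.outer(k, k)
kernel = kernel / kernel.sum()                # normalised 4x4 blur kernel

out = upfirdn2d(x, kernel, up=2, down=1, pad=(1, 1))
# out_h = (16 * 2 + 1 + 1 - 4) // 1 + 1 = 31, so `out` has shape (1, 3, 31, 31)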
-import torch -from torch import nn -from torch.autograd import Function -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward']) - - -class _Voxelization(Function): - - @staticmethod - def forward(ctx, - points, - voxel_size, - coors_range, - max_points=35, - max_voxels=20000): - """Convert kitti points(N, >=3) to voxels. - - Args: - points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points - and points[:, 3:] contain other information like reflectivity. - voxel_size (tuple or float): The size of voxel with the shape of - [3]. - coors_range (tuple or float): The coordinate range of voxel with - the shape of [6]. - max_points (int, optional): maximum points contained in a voxel. if - max_points=-1, it means using dynamic_voxelize. Default: 35. - max_voxels (int, optional): maximum voxels this function create. - for second, 20000 is a good choice. Users should shuffle points - before call this function because max_voxels may drop points. - Default: 20000. - - Returns: - voxels_out (torch.Tensor): Output voxels with the shape of [M, - max_points, ndim]. Only contain points and returned when - max_points != -1. - coors_out (torch.Tensor): Output coordinates with the shape of - [M, 3]. - num_points_per_voxel_out (torch.Tensor): Num points per voxel with - the shape of [M]. Only returned when max_points != -1. - """ - if max_points == -1 or max_voxels == -1: - coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int) - ext_module.dynamic_voxelize_forward(points, coors, voxel_size, - coors_range, 3) - return coors - else: - voxels = points.new_zeros( - size=(max_voxels, max_points, points.size(1))) - coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int) - num_points_per_voxel = points.new_zeros( - size=(max_voxels, ), dtype=torch.int) - voxel_num = ext_module.hard_voxelize_forward( - points, voxels, coors, num_points_per_voxel, voxel_size, - coors_range, max_points, max_voxels, 3) - # select the valid voxels - voxels_out = voxels[:voxel_num] - coors_out = coors[:voxel_num] - num_points_per_voxel_out = num_points_per_voxel[:voxel_num] - return voxels_out, coors_out, num_points_per_voxel_out - - -voxelization = _Voxelization.apply - - -class Voxelization(nn.Module): - """Convert kitti points(N, >=3) to voxels. - - Please refer to `PVCNN `_ for more - details. - - Args: - voxel_size (tuple or float): The size of voxel with the shape of [3]. - point_cloud_range (tuple or float): The coordinate range of voxel with - the shape of [6]. - max_num_points (int): maximum points contained in a voxel. if - max_points=-1, it means using dynamic_voxelize. - max_voxels (int, optional): maximum voxels this function create. - for second, 20000 is a good choice. Users should shuffle points - before call this function because max_voxels may drop points. - Default: 20000. 
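# A hypothetical use of the Voxelization layer above on a KITTI-style point
# cloud. The voxel_size / point_cloud_range numbers are illustrative, hard
# voxelization needs the compiled `_ext` ops and CUDA tensors, and in training
# mode the first entry of the max_voxels tuple is used.
import torch
from annotator.uniformer.mmcv.ops.voxelize import Voxelization

voxel_layer = Voxelization(
    voxel_size=[0.05, 0.05, 0.1],
    point_cloud_range=[0, -40, -3, 70.4, 40, 1],
    max_num_points=5,
    max_voxels=(16000, 40000))                  # (training, inference) caps

points = torch.rand(20000, 4, device='cuda')    # x, y, z, reflectivity
voxels, coors, num_points = voxel_layer(points)
# voxels: (M, 5, 4), coors: (M, 3), num_points: (M,) with M <= 16000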
- """ - - def __init__(self, - voxel_size, - point_cloud_range, - max_num_points, - max_voxels=20000): - super().__init__() - - self.voxel_size = voxel_size - self.point_cloud_range = point_cloud_range - self.max_num_points = max_num_points - if isinstance(max_voxels, tuple): - self.max_voxels = max_voxels - else: - self.max_voxels = _pair(max_voxels) - - point_cloud_range = torch.tensor( - point_cloud_range, dtype=torch.float32) - voxel_size = torch.tensor(voxel_size, dtype=torch.float32) - grid_size = (point_cloud_range[3:] - - point_cloud_range[:3]) / voxel_size - grid_size = torch.round(grid_size).long() - input_feat_shape = grid_size[:2] - self.grid_size = grid_size - # the origin shape is as [x-len, y-len, z-len] - # [w, h, d] -> [d, h, w] - self.pcd_shape = [*input_feat_shape, 1][::-1] - - def forward(self, input): - if self.training: - max_voxels = self.max_voxels[0] - else: - max_voxels = self.max_voxels[1] - - return voxelization(input, self.voxel_size, self.point_cloud_range, - self.max_num_points, max_voxels) - - def __repr__(self): - s = self.__class__.__name__ + '(' - s += 'voxel_size=' + str(self.voxel_size) - s += ', point_cloud_range=' + str(self.point_cloud_range) - s += ', max_num_points=' + str(self.max_num_points) - s += ', max_voxels=' + str(self.max_voxels) - s += ')' - return s diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/__init__.py b/ControlNet/annotator/uniformer/mmcv/parallel/__init__.py deleted file mode 100644 index 2ed2c17ad357742e423beeaf4d35db03fe9af469..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .collate import collate -from .data_container import DataContainer -from .data_parallel import MMDataParallel -from .distributed import MMDistributedDataParallel -from .registry import MODULE_WRAPPERS -from .scatter_gather import scatter, scatter_kwargs -from .utils import is_module_wrapper - -__all__ = [ - 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', - 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' -] diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/_functions.py b/ControlNet/annotator/uniformer/mmcv/parallel/_functions.py deleted file mode 100644 index 9b5a8a44483ab991411d07122b22a1d027e4be8e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/_functions.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.nn.parallel._functions import _get_stream - - -def scatter(input, devices, streams=None): - """Scatters tensor across multiple GPUs.""" - if streams is None: - streams = [None] * len(devices) - - if isinstance(input, list): - chunk_size = (len(input) - 1) // len(devices) + 1 - outputs = [ - scatter(input[i], [devices[i // chunk_size]], - [streams[i // chunk_size]]) for i in range(len(input)) - ] - return outputs - elif isinstance(input, torch.Tensor): - output = input.contiguous() - # TODO: copy to a pinned buffer first (if copying from CPU) - stream = streams[0] if output.numel() > 0 else None - if devices != [-1]: - with torch.cuda.device(devices[0]), torch.cuda.stream(stream): - output = output.cuda(devices[0], non_blocking=True) - else: - # unsqueeze the first dimension thus the tensor's shape is the - # same as those scattered with GPU. 
- output = output.unsqueeze(0) - return output - else: - raise Exception(f'Unknown type {type(input)}.') - - -def synchronize_stream(output, devices, streams): - if isinstance(output, list): - chunk_size = len(output) // len(devices) - for i in range(len(devices)): - for j in range(chunk_size): - synchronize_stream(output[i * chunk_size + j], [devices[i]], - [streams[i]]) - elif isinstance(output, torch.Tensor): - if output.numel() != 0: - with torch.cuda.device(devices[0]): - main_stream = torch.cuda.current_stream() - main_stream.wait_stream(streams[0]) - output.record_stream(main_stream) - else: - raise Exception(f'Unknown type {type(output)}.') - - -def get_input_device(input): - if isinstance(input, list): - for item in input: - input_device = get_input_device(item) - if input_device != -1: - return input_device - return -1 - elif isinstance(input, torch.Tensor): - return input.get_device() if input.is_cuda else -1 - else: - raise Exception(f'Unknown type {type(input)}.') - - -class Scatter: - - @staticmethod - def forward(target_gpus, input): - input_device = get_input_device(input) - streams = None - if input_device == -1 and target_gpus != [-1]: - # Perform CPU to GPU copies in a background stream - streams = [_get_stream(device) for device in target_gpus] - - outputs = scatter(input, target_gpus, streams) - # Synchronize with the copy stream - if streams is not None: - synchronize_stream(outputs, target_gpus, streams) - - return tuple(outputs) diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/collate.py b/ControlNet/annotator/uniformer/mmcv/parallel/collate.py deleted file mode 100644 index ad749197df21b0d74297548be5f66a696adebf7f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/collate.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections.abc import Mapping, Sequence - -import torch -import torch.nn.functional as F -from torch.utils.data.dataloader import default_collate - -from .data_container import DataContainer - - -def collate(batch, samples_per_gpu=1): - """Puts each data field into a tensor/DataContainer with outer dimension - batch size. - - Extend default_collate to add support for - :type:`~mmcv.parallel.DataContainer`. There are 3 cases. - - 1. cpu_only = True, e.g., meta data - 2. cpu_only = False, stack = True, e.g., images tensors - 3. 
cpu_only = False, stack = False, e.g., gt bboxes - """ - - if not isinstance(batch, Sequence): - raise TypeError(f'{batch.dtype} is not supported.') - - if isinstance(batch[0], DataContainer): - stacked = [] - if batch[0].cpu_only: - for i in range(0, len(batch), samples_per_gpu): - stacked.append( - [sample.data for sample in batch[i:i + samples_per_gpu]]) - return DataContainer( - stacked, batch[0].stack, batch[0].padding_value, cpu_only=True) - elif batch[0].stack: - for i in range(0, len(batch), samples_per_gpu): - assert isinstance(batch[i].data, torch.Tensor) - - if batch[i].pad_dims is not None: - ndim = batch[i].dim() - assert ndim > batch[i].pad_dims - max_shape = [0 for _ in range(batch[i].pad_dims)] - for dim in range(1, batch[i].pad_dims + 1): - max_shape[dim - 1] = batch[i].size(-dim) - for sample in batch[i:i + samples_per_gpu]: - for dim in range(0, ndim - batch[i].pad_dims): - assert batch[i].size(dim) == sample.size(dim) - for dim in range(1, batch[i].pad_dims + 1): - max_shape[dim - 1] = max(max_shape[dim - 1], - sample.size(-dim)) - padded_samples = [] - for sample in batch[i:i + samples_per_gpu]: - pad = [0 for _ in range(batch[i].pad_dims * 2)] - for dim in range(1, batch[i].pad_dims + 1): - pad[2 * dim - - 1] = max_shape[dim - 1] - sample.size(-dim) - padded_samples.append( - F.pad( - sample.data, pad, value=sample.padding_value)) - stacked.append(default_collate(padded_samples)) - elif batch[i].pad_dims is None: - stacked.append( - default_collate([ - sample.data - for sample in batch[i:i + samples_per_gpu] - ])) - else: - raise ValueError( - 'pad_dims should be either None or integers (1-3)') - - else: - for i in range(0, len(batch), samples_per_gpu): - stacked.append( - [sample.data for sample in batch[i:i + samples_per_gpu]]) - return DataContainer(stacked, batch[0].stack, batch[0].padding_value) - elif isinstance(batch[0], Sequence): - transposed = zip(*batch) - return [collate(samples, samples_per_gpu) for samples in transposed] - elif isinstance(batch[0], Mapping): - return { - key: collate([d[key] for d in batch], samples_per_gpu) - for key in batch[0] - } - else: - return default_collate(batch) diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/data_container.py b/ControlNet/annotator/uniformer/mmcv/parallel/data_container.py deleted file mode 100644 index cedb0d32a51a1f575a622b38de2cee3ab4757821..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/data_container.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import functools - -import torch - - -def assert_tensor_type(func): - - @functools.wraps(func) - def wrapper(*args, **kwargs): - if not isinstance(args[0].data, torch.Tensor): - raise AttributeError( - f'{args[0].__class__.__name__} has no attribute ' - f'{func.__name__} for type {args[0].datatype}') - return func(*args, **kwargs) - - return wrapper - - -class DataContainer: - """A container for any type of objects. - - Typically tensors will be stacked in the collate function and sliced along - some dimension in the scatter function. This behavior has some limitations. - 1. All tensors have to be the same size. - 2. Types are limited (numpy array or Tensor). - - We design `DataContainer` and `MMDataParallel` to overcome these - limitations. The behavior can be either of the following. 
- - - copy to GPU, pad all tensors to the same size and stack them - - copy to GPU without stacking - - leave the objects as is and pass it to the model - - pad_dims specifies the number of last few dimensions to do padding - """ - - def __init__(self, - data, - stack=False, - padding_value=0, - cpu_only=False, - pad_dims=2): - self._data = data - self._cpu_only = cpu_only - self._stack = stack - self._padding_value = padding_value - assert pad_dims in [None, 1, 2, 3] - self._pad_dims = pad_dims - - def __repr__(self): - return f'{self.__class__.__name__}({repr(self.data)})' - - def __len__(self): - return len(self._data) - - @property - def data(self): - return self._data - - @property - def datatype(self): - if isinstance(self.data, torch.Tensor): - return self.data.type() - else: - return type(self.data) - - @property - def cpu_only(self): - return self._cpu_only - - @property - def stack(self): - return self._stack - - @property - def padding_value(self): - return self._padding_value - - @property - def pad_dims(self): - return self._pad_dims - - @assert_tensor_type - def size(self, *args, **kwargs): - return self.data.size(*args, **kwargs) - - @assert_tensor_type - def dim(self): - return self.data.dim() diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/data_parallel.py b/ControlNet/annotator/uniformer/mmcv/parallel/data_parallel.py deleted file mode 100644 index 79b5f69b654cf647dc7ae9174223781ab5c607d2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/data_parallel.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from itertools import chain - -from torch.nn.parallel import DataParallel - -from .scatter_gather import scatter_kwargs - - -class MMDataParallel(DataParallel): - """The DataParallel module that supports DataContainer. - - MMDataParallel has two main differences with PyTorch DataParallel: - - - It supports a custom type :class:`DataContainer` which allows more - flexible control of input data during both GPU and CPU inference. - - It implement two more APIs ``train_step()`` and ``val_step()``. - - Args: - module (:class:`nn.Module`): Module to be encapsulated. - device_ids (list[int]): Device IDS of modules to be scattered to. - Defaults to None when GPU is not available. - output_device (str | int): Device ID for output. Defaults to None. - dim (int): Dimension used to scatter the data. Defaults to 0. - """ - - def __init__(self, *args, dim=0, **kwargs): - super(MMDataParallel, self).__init__(*args, dim=dim, **kwargs) - self.dim = dim - - def forward(self, *inputs, **kwargs): - """Override the original forward function. - - The main difference lies in the CPU inference where the data in - :class:`DataContainers` will still be gathered. 
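# Sketch of how DataContainer and the `collate` function above fit together:
# image tensors of different sizes are wrapped with stack=True / pad_dims=2, so
# collate pads them to a common (H, W) per GPU group before stacking, while
# cpu_only meta dicts are simply grouped. Assumes the (now removed) in-tree
# mmcv.parallel package is importable; filenames and shapes are made up.
import torch
from annotator.uniformer.mmcv.parallel import DataContainer, collate

batch = [
    dict(img=DataContainer(torch.rand(3, 600, 800), stack=True, pad_dims=2),
         meta=DataContainer(dict(filename='a.jpg'), cpu_only=True)),
    dict(img=DataContainer(torch.rand(3, 640, 480), stack=True, pad_dims=2),
         meta=DataContainer(dict(filename='b.jpg'), cpu_only=True)),
]
out = collate(batch, samples_per_gpu=2)
# out['img'].data[0] has shape (2, 3, 640, 800): padded to the group maxima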
- """ - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module(*inputs[0], **kwargs[0]) - else: - return super().forward(*inputs, **kwargs) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def train_step(self, *inputs, **kwargs): - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module.train_step(*inputs[0], **kwargs[0]) - - assert len(self.device_ids) == 1, \ - ('MMDataParallel only supports single GPU training, if you need to' - ' train with multiple GPUs, please use MMDistributedDataParallel' - 'instead.') - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - 'module must have its parameters and buffers ' - f'on device {self.src_device_obj} (device_ids[0]) but ' - f'found one of them on device: {t.device}') - - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - return self.module.train_step(*inputs[0], **kwargs[0]) - - def val_step(self, *inputs, **kwargs): - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module.val_step(*inputs[0], **kwargs[0]) - - assert len(self.device_ids) == 1, \ - ('MMDataParallel only supports single GPU training, if you need to' - ' train with multiple GPUs, please use MMDistributedDataParallel' - ' instead.') - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - 'module must have its parameters and buffers ' - f'on device {self.src_device_obj} (device_ids[0]) but ' - f'found one of them on device: {t.device}') - - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - return self.module.val_step(*inputs[0], **kwargs[0]) diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/distributed.py b/ControlNet/annotator/uniformer/mmcv/parallel/distributed.py deleted file mode 100644 index 1e4c27903db58a54d37ea1ed9ec0104098b486f2..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/distributed.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.nn.parallel.distributed import (DistributedDataParallel, - _find_tensors) - -from annotator.uniformer.mmcv import print_log -from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version -from .scatter_gather import scatter_kwargs - - -class MMDistributedDataParallel(DistributedDataParallel): - """The DDP module that supports DataContainer. - - MMDDP has two main differences with PyTorch DDP: - - - It supports a custom type :class:`DataContainer` which allows more - flexible control of input data. - - It implement two APIs ``train_step()`` and ``val_step()``. 
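Editor's note: `MMDataParallel` only dispatches to a single GPU (or the CPU when `device_ids` is empty), but it still routes inputs through `scatter_kwargs`, so `DataContainer` fields are unwrapped before they reach the model. A rough usage sketch assuming upstream mmcv 1.x; the toy model and its `train_step` are illustrative.

import torch
import torch.nn as nn
from mmcv.parallel import DataContainer, MMDataParallel


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

    def train_step(self, data, optimizer=None):
        # By the time train_step is reached, scatter_kwargs has unwrapped the
        # DataContainer, so data['img'] is a plain tensor on the target GPU.
        return dict(loss=self.fc(data['img']).sum())


if torch.cuda.is_available():
    model = MMDataParallel(ToyModel().cuda(), device_ids=[0])   # single GPU only
    batch = dict(img=DataContainer(torch.rand(2, 4), stack=True))
    print(model.train_step(batch)['loss'])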
- """ - - def to_kwargs(self, inputs, kwargs, device_id): - # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8 - # to move all tensors to device_id - return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def train_step(self, *inputs, **kwargs): - """train_step() API for module wrapped by DistributedDataParallel. - - This method is basically the same as - ``DistributedDataParallel.forward()``, while replacing - ``self.module.forward()`` with ``self.module.train_step()``. - It is compatible with PyTorch 1.1 - 1.5. - """ - - # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the - # end of backward to the beginning of forward. - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.7') - and self.reducer._rebuild_buckets()): - print_log( - 'Reducer buckets have been rebuilt in this iteration.', - logger='mmcv') - - if getattr(self, 'require_forward_param_sync', True): - self._sync_params() - if self.device_ids: - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - if len(self.device_ids) == 1: - output = self.module.train_step(*inputs[0], **kwargs[0]) - else: - outputs = self.parallel_apply( - self._module_copies[:len(inputs)], inputs, kwargs) - output = self.gather(outputs, self.output_device) - else: - output = self.module.train_step(*inputs, **kwargs) - - if torch.is_grad_enabled() and getattr( - self, 'require_backward_grad_sync', True): - if self.find_unused_parameters: - self.reducer.prepare_for_backward(list(_find_tensors(output))) - else: - self.reducer.prepare_for_backward([]) - else: - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) > digit_version('1.2')): - self.require_forward_param_sync = False - return output - - def val_step(self, *inputs, **kwargs): - """val_step() API for module wrapped by DistributedDataParallel. - - This method is basically the same as - ``DistributedDataParallel.forward()``, while replacing - ``self.module.forward()`` with ``self.module.val_step()``. - It is compatible with PyTorch 1.1 - 1.5. - """ - # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the - # end of backward to the beginning of forward. 
- if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.7') - and self.reducer._rebuild_buckets()): - print_log( - 'Reducer buckets have been rebuilt in this iteration.', - logger='mmcv') - - if getattr(self, 'require_forward_param_sync', True): - self._sync_params() - if self.device_ids: - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - if len(self.device_ids) == 1: - output = self.module.val_step(*inputs[0], **kwargs[0]) - else: - outputs = self.parallel_apply( - self._module_copies[:len(inputs)], inputs, kwargs) - output = self.gather(outputs, self.output_device) - else: - output = self.module.val_step(*inputs, **kwargs) - - if torch.is_grad_enabled() and getattr( - self, 'require_backward_grad_sync', True): - if self.find_unused_parameters: - self.reducer.prepare_for_backward(list(_find_tensors(output))) - else: - self.reducer.prepare_for_backward([]) - else: - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) > digit_version('1.2')): - self.require_forward_param_sync = False - return output diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/distributed_deprecated.py b/ControlNet/annotator/uniformer/mmcv/parallel/distributed_deprecated.py deleted file mode 100644 index 676937a2085d4da20fa87923041a200fca6214eb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/distributed_deprecated.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.distributed as dist -import torch.nn as nn -from torch._utils import (_flatten_dense_tensors, _take_tensors, - _unflatten_dense_tensors) - -from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version -from .registry import MODULE_WRAPPERS -from .scatter_gather import scatter_kwargs - - -@MODULE_WRAPPERS.register_module() -class MMDistributedDataParallel(nn.Module): - - def __init__(self, - module, - dim=0, - broadcast_buffers=True, - bucket_cap_mb=25): - super(MMDistributedDataParallel, self).__init__() - self.module = module - self.dim = dim - self.broadcast_buffers = broadcast_buffers - - self.broadcast_bucket_size = bucket_cap_mb * 1024 * 1024 - self._sync_params() - - def _dist_broadcast_coalesced(self, tensors, buffer_size): - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - dist.broadcast(flat_tensors, 0) - for tensor, synced in zip( - tensors, _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) - - def _sync_params(self): - module_states = list(self.module.state_dict().values()) - if len(module_states) > 0: - self._dist_broadcast_coalesced(module_states, - self.broadcast_bucket_size) - if self.broadcast_buffers: - if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) < digit_version('1.0')): - buffers = [b.data for b in self.module._all_buffers()] - else: - buffers = [b.data for b in self.module.buffers()] - if len(buffers) > 0: - self._dist_broadcast_coalesced(buffers, - self.broadcast_bucket_size) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def forward(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - return self.module(*inputs[0], **kwargs[0]) - - def train_step(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - output = self.module.train_step(*inputs[0], **kwargs[0]) - return 
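Editor's note: the deprecated `MMDistributedDataParallel._dist_broadcast_coalesced` above relies on a standard flatten/broadcast/unflatten trick: tensors are grouped into size-limited buckets, each bucket is flattened into one contiguous buffer, broadcast with a single collective, then copied back element-wise. A standalone sketch of that pattern, using a single-process gloo group so it runs anywhere; the bucket size and tensors are illustrative.

import os
import torch
import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)

# Single-process "world": the broadcast is a no-op but the code path runs.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

tensors = [torch.rand(3), torch.rand(2, 2), torch.rand(5)]
bucket_size_bytes = 1024 * 1024
for bucket in _take_tensors(tensors, bucket_size_bytes):
    flat = _flatten_dense_tensors(bucket)        # one contiguous buffer per bucket
    dist.broadcast(flat, src=0)                  # a single collective per bucket
    for t, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
        t.copy_(synced)                          # write rank-0 values back in place

dist.destroy_process_group()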
output - - def val_step(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - output = self.module.val_step(*inputs[0], **kwargs[0]) - return output diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/registry.py b/ControlNet/annotator/uniformer/mmcv/parallel/registry.py deleted file mode 100644 index a204a07fba10e614223f090d1a57cf9c4d74d4a1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/registry.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from torch.nn.parallel import DataParallel, DistributedDataParallel - -from annotator.uniformer.mmcv.utils import Registry - -MODULE_WRAPPERS = Registry('module wrapper') -MODULE_WRAPPERS.register_module(module=DataParallel) -MODULE_WRAPPERS.register_module(module=DistributedDataParallel) diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/scatter_gather.py b/ControlNet/annotator/uniformer/mmcv/parallel/scatter_gather.py deleted file mode 100644 index 900ff88566f8f14830590459dc4fd16d4b382e47..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/scatter_gather.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.nn.parallel._functions import Scatter as OrigScatter - -from ._functions import Scatter -from .data_container import DataContainer - - -def scatter(inputs, target_gpus, dim=0): - """Scatter inputs to target gpus. - - The only difference from original :func:`scatter` is to add support for - :type:`~mmcv.parallel.DataContainer`. - """ - - def scatter_map(obj): - if isinstance(obj, torch.Tensor): - if target_gpus != [-1]: - return OrigScatter.apply(target_gpus, None, dim, obj) - else: - # for CPU inference we use self-implemented scatter - return Scatter.forward(target_gpus, obj) - if isinstance(obj, DataContainer): - if obj.cpu_only: - return obj.data - else: - return Scatter.forward(target_gpus, obj.data) - if isinstance(obj, tuple) and len(obj) > 0: - return list(zip(*map(scatter_map, obj))) - if isinstance(obj, list) and len(obj) > 0: - out = list(map(list, zip(*map(scatter_map, obj)))) - return out - if isinstance(obj, dict) and len(obj) > 0: - out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) - return out - return [obj for targets in target_gpus] - - # After scatter_map is called, a scatter_map cell will exist. This cell - # has a reference to the actual function scatter_map, which has references - # to a closure that has a reference to the scatter_map cell (because the - # fn is recursive). 
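Editor's note: `MODULE_WRAPPERS` in registry.py above is a plain mmcv `Registry` pre-populated with `DataParallel` and `DistributedDataParallel`; user code can register its own wrapper so the runner and checkpoint utilities treat it the same way. A hedged sketch assuming upstream mmcv 1.x; `ToyWrapper` is a hypothetical class.

import torch.nn as nn
from mmcv.parallel import MODULE_WRAPPERS


@MODULE_WRAPPERS.register_module()
class ToyWrapper(nn.Module):
    """A user-defined wrapper holding the real model under `.module`."""

    def __init__(self, module):
        super().__init__()
        self.module = module


print('ToyWrapper' in MODULE_WRAPPERS.module_dict)                 # True
print('DistributedDataParallel' in MODULE_WRAPPERS.module_dict)    # True, pre-registered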
To avoid this reference cycle, we set the function to - # None, clearing the cell - try: - return scatter_map(inputs) - finally: - scatter_map = None - - -def scatter_kwargs(inputs, kwargs, target_gpus, dim=0): - """Scatter with support for kwargs dictionary.""" - inputs = scatter(inputs, target_gpus, dim) if inputs else [] - kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] - if len(inputs) < len(kwargs): - inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) - elif len(kwargs) < len(inputs): - kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) - inputs = tuple(inputs) - kwargs = tuple(kwargs) - return inputs, kwargs diff --git a/ControlNet/annotator/uniformer/mmcv/parallel/utils.py b/ControlNet/annotator/uniformer/mmcv/parallel/utils.py deleted file mode 100644 index 0f5712cb42c38a2e8563bf563efb6681383cab9b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/parallel/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .registry import MODULE_WRAPPERS - - -def is_module_wrapper(module): - """Check if a module is a module wrapper. - - The following 3 modules in MMCV (and their subclasses) are regarded as - module wrappers: DataParallel, DistributedDataParallel, - MMDistributedDataParallel (the deprecated version). You may add you own - module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. - - Args: - module (nn.Module): The module to be checked. - - Returns: - bool: True if the input module is a module wrapper. - """ - module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) - return isinstance(module, module_wrappers) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/__init__.py b/ControlNet/annotator/uniformer/mmcv/runner/__init__.py deleted file mode 100644 index 52e4b48d383a84a055dcd7f6236f6e8e58eab924..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
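Editor's note: `is_module_wrapper` (defined in utils.py above) is the check the runner and checkpoint code use before reaching into `.module`. A small self-contained sketch, assuming upstream mmcv 1.x.

import torch.nn as nn
from mmcv.parallel import MMDataParallel, is_module_wrapper

model = nn.Linear(2, 2)
wrapped = MMDataParallel(model)

print(is_module_wrapper(wrapped), is_module_wrapper(model))   # True False

# The unwrap idiom used throughout the runner/checkpoint code:
inner = wrapped.module if is_module_wrapper(wrapped) else wrapped
assert inner is model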
-from .base_module import BaseModule, ModuleList, Sequential -from .base_runner import BaseRunner -from .builder import RUNNERS, build_runner -from .checkpoint import (CheckpointLoader, _load_checkpoint, - _load_checkpoint_with_prefix, load_checkpoint, - load_state_dict, save_checkpoint, weights_to_cpu) -from .default_constructor import DefaultRunnerConstructor -from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info, - init_dist, master_only) -from .epoch_based_runner import EpochBasedRunner, Runner -from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model -from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook, - DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook, - Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, - GradientCumulativeOptimizerHook, Hook, IterTimerHook, - LoggerHook, LrUpdaterHook, MlflowLoggerHook, - NeptuneLoggerHook, OptimizerHook, PaviLoggerHook, - SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook, - WandbLoggerHook) -from .iter_based_runner import IterBasedRunner, IterLoader -from .log_buffer import LogBuffer -from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS, - DefaultOptimizerConstructor, build_optimizer, - build_optimizer_constructor) -from .priority import Priority, get_priority -from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed - -__all__ = [ - 'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer', - 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', - 'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook', - 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', - 'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook', - 'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict', - 'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority', - 'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict', - 'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS', - 'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer', - 'build_optimizer_constructor', 'IterLoader', 'set_random_seed', - 'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook', - 'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads', - 'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule', - '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential', - 'ModuleList', 'GradientCumulativeOptimizerHook', - 'GradientCumulativeFp16OptimizerHook', 'DefaultRunnerConstructor' -] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/base_module.py b/ControlNet/annotator/uniformer/mmcv/runner/base_module.py deleted file mode 100644 index 617fad9bb89f10a9a0911d962dfb3bc8f3a3628c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/base_module.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import warnings -from abc import ABCMeta -from collections import defaultdict -from logging import FileHandler - -import torch.nn as nn - -from annotator.uniformer.mmcv.runner.dist_utils import master_only -from annotator.uniformer.mmcv.utils.logging import get_logger, logger_initialized, print_log - - -class BaseModule(nn.Module, metaclass=ABCMeta): - """Base module for all modules in openmmlab. - - ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional - functionality of parameter initialization. 
Compared with - ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes. - - - ``init_cfg``: the config to control the initialization. - - ``init_weights``: The function of parameter - initialization and recording initialization - information. - - ``_params_init_info``: Used to track the parameter - initialization information. This attribute only - exists during executing the ``init_weights``. - - Args: - init_cfg (dict, optional): Initialization config dict. - """ - - def __init__(self, init_cfg=None): - """Initialize BaseModule, inherited from `torch.nn.Module`""" - - # NOTE init_cfg can be defined in different levels, but init_cfg - # in low levels has a higher priority. - - super(BaseModule, self).__init__() - # define default value of init_cfg instead of hard code - # in init_weights() function - self._is_init = False - - self.init_cfg = copy.deepcopy(init_cfg) - - # Backward compatibility in derived classes - # if pretrained is not None: - # warnings.warn('DeprecationWarning: pretrained is a deprecated \ - # key, please consider using init_cfg') - # self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - @property - def is_init(self): - return self._is_init - - def init_weights(self): - """Initialize the weights.""" - - is_top_level_module = False - # check if it is top-level module - if not hasattr(self, '_params_init_info'): - # The `_params_init_info` is used to record the initialization - # information of the parameters - # the key should be the obj:`nn.Parameter` of model and the value - # should be a dict containing - # - init_info (str): The string that describes the initialization. - # - tmp_mean_value (FloatTensor): The mean of the parameter, - # which indicates whether the parameter has been modified. - # this attribute would be deleted after all parameters - # is initialized. - self._params_init_info = defaultdict(dict) - is_top_level_module = True - - # Initialize the `_params_init_info`, - # When detecting the `tmp_mean_value` of - # the corresponding parameter is changed, update related - # initialization information - for name, param in self.named_parameters(): - self._params_init_info[param][ - 'init_info'] = f'The value is the same before and ' \ - f'after calling `init_weights` ' \ - f'of {self.__class__.__name__} ' - self._params_init_info[param][ - 'tmp_mean_value'] = param.data.mean() - - # pass `params_init_info` to all submodules - # All submodules share the same `params_init_info`, - # so it will be updated when parameters are - # modified at any level of the model. 
- for sub_module in self.modules(): - sub_module._params_init_info = self._params_init_info - - # Get the initialized logger, if not exist, - # create a logger named `mmcv` - logger_names = list(logger_initialized.keys()) - logger_name = logger_names[0] if logger_names else 'mmcv' - - from ..cnn import initialize - from ..cnn.utils.weight_init import update_init_info - module_name = self.__class__.__name__ - if not self._is_init: - if self.init_cfg: - print_log( - f'initialize {module_name} with init_cfg {self.init_cfg}', - logger=logger_name) - initialize(self, self.init_cfg) - if isinstance(self.init_cfg, dict): - # prevent the parameters of - # the pre-trained model - # from being overwritten by - # the `init_weights` - if self.init_cfg['type'] == 'Pretrained': - return - - for m in self.children(): - if hasattr(m, 'init_weights'): - m.init_weights() - # users may overload the `init_weights` - update_init_info( - m, - init_info=f'Initialized by ' - f'user-defined `init_weights`' - f' in {m.__class__.__name__} ') - - self._is_init = True - else: - warnings.warn(f'init_weights of {self.__class__.__name__} has ' - f'been called more than once.') - - if is_top_level_module: - self._dump_init_info(logger_name) - - for sub_module in self.modules(): - del sub_module._params_init_info - - @master_only - def _dump_init_info(self, logger_name): - """Dump the initialization information to a file named - `initialization.log.json` in workdir. - - Args: - logger_name (str): The name of logger. - """ - - logger = get_logger(logger_name) - - with_file_handler = False - # dump the information to the logger file if there is a `FileHandler` - for handler in logger.handlers: - if isinstance(handler, FileHandler): - handler.stream.write( - 'Name of parameter - Initialization information\n') - for name, param in self.named_parameters(): - handler.stream.write( - f'\n{name} - {param.shape}: ' - f"\n{self._params_init_info[param]['init_info']} \n") - handler.stream.flush() - with_file_handler = True - if not with_file_handler: - for name, param in self.named_parameters(): - print_log( - f'\n{name} - {param.shape}: ' - f"\n{self._params_init_info[param]['init_info']} \n ", - logger=logger_name) - - def __repr__(self): - s = super().__repr__() - if self.init_cfg: - s += f'\ninit_cfg={self.init_cfg}' - return s - - -class Sequential(BaseModule, nn.Sequential): - """Sequential module in openmmlab. - - Args: - init_cfg (dict, optional): Initialization config dict. - """ - - def __init__(self, *args, init_cfg=None): - BaseModule.__init__(self, init_cfg) - nn.Sequential.__init__(self, *args) - - -class ModuleList(BaseModule, nn.ModuleList): - """ModuleList in openmmlab. - - Args: - modules (iterable, optional): an iterable of modules to add. - init_cfg (dict, optional): Initialization config dict. - """ - - def __init__(self, modules=None, init_cfg=None): - BaseModule.__init__(self, init_cfg) - nn.ModuleList.__init__(self, modules) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/base_runner.py b/ControlNet/annotator/uniformer/mmcv/runner/base_runner.py deleted file mode 100644 index 4928db0a73b56fe0218a4bf66ec4ffa082d31ccc..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/base_runner.py +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
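Editor's note: `BaseModule`, `Sequential` and `ModuleList` above add declarative weight initialization on top of their torch counterparts: `init_cfg` is interpreted by `mmcv.cnn.initialize` when `init_weights()` is called, and the init info is recorded per parameter. A hedged sketch assuming upstream mmcv 1.x; `ToyHead` and the layer sizes are illustrative.

import torch.nn as nn
from mmcv.runner import BaseModule, Sequential


class ToyHead(BaseModule):
    def __init__(self, init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        # The inner Sequential carries its own init_cfg for the conv layers.
        self.convs = Sequential(
            nn.Conv2d(3, 8, 3),
            nn.Conv2d(8, 8, 3),
            init_cfg=dict(type='Kaiming', layer='Conv2d'))
        self.fc = nn.Linear(8, 4)


# Conv2d layers get Kaiming init from the inner cfg, the Linear gets the
# outer Normal cfg; init_weights() applies both and logs what it did.
head = ToyHead(init_cfg=dict(type='Normal', layer='Linear', std=0.01))
head.init_weights()
print(head.is_init)   # True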
-import copy -import logging -import os.path as osp -import warnings -from abc import ABCMeta, abstractmethod - -import torch -from torch.optim import Optimizer - -import annotator.uniformer.mmcv as mmcv -from ..parallel import is_module_wrapper -from .checkpoint import load_checkpoint -from .dist_utils import get_dist_info -from .hooks import HOOKS, Hook -from .log_buffer import LogBuffer -from .priority import Priority, get_priority -from .utils import get_time_str - - -class BaseRunner(metaclass=ABCMeta): - """The base class of Runner, a training helper for PyTorch. - - All subclasses should implement the following APIs: - - - ``run()`` - - ``train()`` - - ``val()`` - - ``save_checkpoint()`` - - Args: - model (:obj:`torch.nn.Module`): The model to be run. - batch_processor (callable): A callable method that process a data - batch. The interface of this method should be - `batch_processor(model, data, train_mode) -> dict` - optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an - optimizer (in most cases) or a dict of optimizers (in models that - requires more than one optimizer, e.g., GAN). - work_dir (str, optional): The working directory to save checkpoints - and logs. Defaults to None. - logger (:obj:`logging.Logger`): Logger used during training. - Defaults to None. (The default value is just for backward - compatibility) - meta (dict | None): A dict records some import information such as - environment info and seed, which will be logged in logger hook. - Defaults to None. - max_epochs (int, optional): Total training epochs. - max_iters (int, optional): Total training iterations. - """ - - def __init__(self, - model, - batch_processor=None, - optimizer=None, - work_dir=None, - logger=None, - meta=None, - max_iters=None, - max_epochs=None): - if batch_processor is not None: - if not callable(batch_processor): - raise TypeError('batch_processor must be callable, ' - f'but got {type(batch_processor)}') - warnings.warn('batch_processor is deprecated, please implement ' - 'train_step() and val_step() in the model instead.') - # raise an error is `batch_processor` is not None and - # `model.train_step()` exists. 
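Editor's note: the `BaseRunner` docstring above spells out the constructor contract (a model exposing `train_step`, an optional optimizer or dict of optimizers, a `logging.Logger`, and exactly one of `max_epochs`/`max_iters`). A hedged construction sketch using the concrete `EpochBasedRunner` subclass, assuming upstream mmcv 1.x; the model, work_dir and hyper-parameters are illustrative.

import logging
import torch
import torch.nn as nn
from mmcv.runner import EpochBasedRunner


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

    def train_step(self, data, optimizer=None):
        # BaseRunner only requires that this method exists and returns a dict.
        loss = self.fc(data).sum()
        return dict(loss=loss, log_vars=dict(loss=float(loss)), num_samples=len(data))


model = ToyModel()
runner = EpochBasedRunner(
    model,
    optimizer=torch.optim.SGD(model.parameters(), lr=0.01),
    work_dir='./toy_work_dir',            # hypothetical directory, created on init
    logger=logging.getLogger('toy'),
    max_epochs=12)
print(runner.model_name, runner.rank, runner.world_size)   # ToyModel 0 1
print(runner.max_epochs, runner.epoch, runner.iter)        # 12 0 0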
- if is_module_wrapper(model): - _model = model.module - else: - _model = model - if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'): - raise RuntimeError( - 'batch_processor and model.train_step()/model.val_step() ' - 'cannot be both available.') - else: - assert hasattr(model, 'train_step') - - # check the type of `optimizer` - if isinstance(optimizer, dict): - for name, optim in optimizer.items(): - if not isinstance(optim, Optimizer): - raise TypeError( - f'optimizer must be a dict of torch.optim.Optimizers, ' - f'but optimizer["{name}"] is a {type(optim)}') - elif not isinstance(optimizer, Optimizer) and optimizer is not None: - raise TypeError( - f'optimizer must be a torch.optim.Optimizer object ' - f'or dict or None, but got {type(optimizer)}') - - # check the type of `logger` - if not isinstance(logger, logging.Logger): - raise TypeError(f'logger must be a logging.Logger object, ' - f'but got {type(logger)}') - - # check the type of `meta` - if meta is not None and not isinstance(meta, dict): - raise TypeError( - f'meta must be a dict or None, but got {type(meta)}') - - self.model = model - self.batch_processor = batch_processor - self.optimizer = optimizer - self.logger = logger - self.meta = meta - # create work_dir - if mmcv.is_str(work_dir): - self.work_dir = osp.abspath(work_dir) - mmcv.mkdir_or_exist(self.work_dir) - elif work_dir is None: - self.work_dir = None - else: - raise TypeError('"work_dir" must be a str or None') - - # get model name from the model class - if hasattr(self.model, 'module'): - self._model_name = self.model.module.__class__.__name__ - else: - self._model_name = self.model.__class__.__name__ - - self._rank, self._world_size = get_dist_info() - self.timestamp = get_time_str() - self.mode = None - self._hooks = [] - self._epoch = 0 - self._iter = 0 - self._inner_iter = 0 - - if max_epochs is not None and max_iters is not None: - raise ValueError( - 'Only one of `max_epochs` or `max_iters` can be set.') - - self._max_epochs = max_epochs - self._max_iters = max_iters - # TODO: Redesign LogBuffer, it is not flexible and elegant enough - self.log_buffer = LogBuffer() - - @property - def model_name(self): - """str: Name of the model, usually the module class name.""" - return self._model_name - - @property - def rank(self): - """int: Rank of current process. (distributed training)""" - return self._rank - - @property - def world_size(self): - """int: Number of processes participating in the job. - (distributed training)""" - return self._world_size - - @property - def hooks(self): - """list[:obj:`Hook`]: A list of registered hooks.""" - return self._hooks - - @property - def epoch(self): - """int: Current epoch.""" - return self._epoch - - @property - def iter(self): - """int: Current iteration.""" - return self._iter - - @property - def inner_iter(self): - """int: Iteration in an epoch.""" - return self._inner_iter - - @property - def max_epochs(self): - """int: Maximum training epochs.""" - return self._max_epochs - - @property - def max_iters(self): - """int: Maximum training iterations.""" - return self._max_iters - - @abstractmethod - def train(self): - pass - - @abstractmethod - def val(self): - pass - - @abstractmethod - def run(self, data_loaders, workflow, **kwargs): - pass - - @abstractmethod - def save_checkpoint(self, - out_dir, - filename_tmpl, - save_optimizer=True, - meta=None, - create_symlink=True): - pass - - def current_lr(self): - """Get current learning rates. 
- - Returns: - list[float] | dict[str, list[float]]: Current learning rates of all - param groups. If the runner has a dict of optimizers, this - method will return a dict. - """ - if isinstance(self.optimizer, torch.optim.Optimizer): - lr = [group['lr'] for group in self.optimizer.param_groups] - elif isinstance(self.optimizer, dict): - lr = dict() - for name, optim in self.optimizer.items(): - lr[name] = [group['lr'] for group in optim.param_groups] - else: - raise RuntimeError( - 'lr is not applicable because optimizer does not exist.') - return lr - - def current_momentum(self): - """Get current momentums. - - Returns: - list[float] | dict[str, list[float]]: Current momentums of all - param groups. If the runner has a dict of optimizers, this - method will return a dict. - """ - - def _get_momentum(optimizer): - momentums = [] - for group in optimizer.param_groups: - if 'momentum' in group.keys(): - momentums.append(group['momentum']) - elif 'betas' in group.keys(): - momentums.append(group['betas'][0]) - else: - momentums.append(0) - return momentums - - if self.optimizer is None: - raise RuntimeError( - 'momentum is not applicable because optimizer does not exist.') - elif isinstance(self.optimizer, torch.optim.Optimizer): - momentums = _get_momentum(self.optimizer) - elif isinstance(self.optimizer, dict): - momentums = dict() - for name, optim in self.optimizer.items(): - momentums[name] = _get_momentum(optim) - return momentums - - def register_hook(self, hook, priority='NORMAL'): - """Register a hook into the hook list. - - The hook will be inserted into a priority queue, with the specified - priority (See :class:`Priority` for details of priorities). - For hooks with the same priority, they will be triggered in the same - order as they are registered. - - Args: - hook (:obj:`Hook`): The hook to be registered. - priority (int or str or :obj:`Priority`): Hook priority. - Lower value means higher priority. - """ - assert isinstance(hook, Hook) - if hasattr(hook, 'priority'): - raise ValueError('"priority" is a reserved attribute for hooks') - priority = get_priority(priority) - hook.priority = priority - # insert the hook to a sorted list - inserted = False - for i in range(len(self._hooks) - 1, -1, -1): - if priority >= self._hooks[i].priority: - self._hooks.insert(i + 1, hook) - inserted = True - break - if not inserted: - self._hooks.insert(0, hook) - - def register_hook_from_cfg(self, hook_cfg): - """Register a hook from its cfg. - - Args: - hook_cfg (dict): Hook config. It should have at least keys 'type' - and 'priority' indicating its type and priority. - - Notes: - The specific hook class to register should not use 'type' and - 'priority' arguments during initialization. - """ - hook_cfg = hook_cfg.copy() - priority = hook_cfg.pop('priority', 'NORMAL') - hook = mmcv.build_from_cfg(hook_cfg, HOOKS) - self.register_hook(hook, priority=priority) - - def call_hook(self, fn_name): - """Call all hooks. - - Args: - fn_name (str): The function name in each hook to be called, such as - "before_train_epoch". 
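Editor's note: `register_hook` above keeps the hook list sorted by priority (lower value fires first, ties run in registration order) and `call_hook` fans a stage name out to every registered hook. A compact sketch assuming upstream mmcv 1.x; the model and hook are illustrative.

import logging
import torch.nn as nn
from mmcv.runner import EpochBasedRunner, Hook


class ToyModel(nn.Module):
    def train_step(self, data, optimizer=None):   # minimal contract BaseRunner checks for
        return dict(loss=0.0)


class GreetHook(Hook):
    def before_train_epoch(self, runner):
        print(f'starting epoch {runner.epoch + 1}/{runner.max_epochs}')


runner = EpochBasedRunner(ToyModel(), logger=logging.getLogger('demo'), max_epochs=2)

# 'VERY_LOW' (90) places the hook after all default training hooks.
runner.register_hook(GreetHook(), priority='VERY_LOW')
runner.call_hook('before_train_epoch')   # -> "starting epoch 1/2"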
- """ - for hook in self._hooks: - getattr(hook, fn_name)(self) - - def get_hook_info(self): - # Get hooks info in each stage - stage_hook_map = {stage: [] for stage in Hook.stages} - for hook in self.hooks: - try: - priority = Priority(hook.priority).name - except ValueError: - priority = hook.priority - classname = hook.__class__.__name__ - hook_info = f'({priority:<12}) {classname:<35}' - for trigger_stage in hook.get_triggered_stages(): - stage_hook_map[trigger_stage].append(hook_info) - - stage_hook_infos = [] - for stage in Hook.stages: - hook_infos = stage_hook_map[stage] - if len(hook_infos) > 0: - info = f'{stage}:\n' - info += '\n'.join(hook_infos) - info += '\n -------------------- ' - stage_hook_infos.append(info) - return '\n'.join(stage_hook_infos) - - def load_checkpoint(self, - filename, - map_location='cpu', - strict=False, - revise_keys=[(r'^module.', '')]): - return load_checkpoint( - self.model, - filename, - map_location, - strict, - self.logger, - revise_keys=revise_keys) - - def resume(self, - checkpoint, - resume_optimizer=True, - map_location='default'): - if map_location == 'default': - if torch.cuda.is_available(): - device_id = torch.cuda.current_device() - checkpoint = self.load_checkpoint( - checkpoint, - map_location=lambda storage, loc: storage.cuda(device_id)) - else: - checkpoint = self.load_checkpoint(checkpoint) - else: - checkpoint = self.load_checkpoint( - checkpoint, map_location=map_location) - - self._epoch = checkpoint['meta']['epoch'] - self._iter = checkpoint['meta']['iter'] - if self.meta is None: - self.meta = {} - self.meta.setdefault('hook_msgs', {}) - # load `last_ckpt`, `best_score`, `best_ckpt`, etc. for hook messages - self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) - - # Re-calculate the number of iterations when resuming - # models with different number of GPUs - if 'config' in checkpoint['meta']: - config = mmcv.Config.fromstring( - checkpoint['meta']['config'], file_format='.py') - previous_gpu_ids = config.get('gpu_ids', None) - if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( - previous_gpu_ids) != self.world_size: - self._iter = int(self._iter * len(previous_gpu_ids) / - self.world_size) - self.logger.info('the iteration number is changed due to ' - 'change of GPU number') - - # resume meta information meta - self.meta = checkpoint['meta'] - - if 'optimizer' in checkpoint and resume_optimizer: - if isinstance(self.optimizer, Optimizer): - self.optimizer.load_state_dict(checkpoint['optimizer']) - elif isinstance(self.optimizer, dict): - for k in self.optimizer.keys(): - self.optimizer[k].load_state_dict( - checkpoint['optimizer'][k]) - else: - raise TypeError( - 'Optimizer should be dict or torch.optim.Optimizer ' - f'but got {type(self.optimizer)}') - - self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) - - def register_lr_hook(self, lr_config): - if lr_config is None: - return - elif isinstance(lr_config, dict): - assert 'policy' in lr_config - policy_type = lr_config.pop('policy') - # If the type of policy is all in lower case, e.g., 'cyclic', - # then its first letter will be capitalized, e.g., to be 'Cyclic'. - # This is for the convenient usage of Lr updater. - # Since this is not applicable for ` - # CosineAnnealingLrUpdater`, - # the string will not be changed if it contains capital letters. 
- if policy_type == policy_type.lower(): - policy_type = policy_type.title() - hook_type = policy_type + 'LrUpdaterHook' - lr_config['type'] = hook_type - hook = mmcv.build_from_cfg(lr_config, HOOKS) - else: - hook = lr_config - self.register_hook(hook, priority='VERY_HIGH') - - def register_momentum_hook(self, momentum_config): - if momentum_config is None: - return - if isinstance(momentum_config, dict): - assert 'policy' in momentum_config - policy_type = momentum_config.pop('policy') - # If the type of policy is all in lower case, e.g., 'cyclic', - # then its first letter will be capitalized, e.g., to be 'Cyclic'. - # This is for the convenient usage of momentum updater. - # Since this is not applicable for - # `CosineAnnealingMomentumUpdater`, - # the string will not be changed if it contains capital letters. - if policy_type == policy_type.lower(): - policy_type = policy_type.title() - hook_type = policy_type + 'MomentumUpdaterHook' - momentum_config['type'] = hook_type - hook = mmcv.build_from_cfg(momentum_config, HOOKS) - else: - hook = momentum_config - self.register_hook(hook, priority='HIGH') - - def register_optimizer_hook(self, optimizer_config): - if optimizer_config is None: - return - if isinstance(optimizer_config, dict): - optimizer_config.setdefault('type', 'OptimizerHook') - hook = mmcv.build_from_cfg(optimizer_config, HOOKS) - else: - hook = optimizer_config - self.register_hook(hook, priority='ABOVE_NORMAL') - - def register_checkpoint_hook(self, checkpoint_config): - if checkpoint_config is None: - return - if isinstance(checkpoint_config, dict): - checkpoint_config.setdefault('type', 'CheckpointHook') - hook = mmcv.build_from_cfg(checkpoint_config, HOOKS) - else: - hook = checkpoint_config - self.register_hook(hook, priority='NORMAL') - - def register_logger_hooks(self, log_config): - if log_config is None: - return - log_interval = log_config['interval'] - for info in log_config['hooks']: - logger_hook = mmcv.build_from_cfg( - info, HOOKS, default_args=dict(interval=log_interval)) - self.register_hook(logger_hook, priority='VERY_LOW') - - def register_timer_hook(self, timer_config): - if timer_config is None: - return - if isinstance(timer_config, dict): - timer_config_ = copy.deepcopy(timer_config) - hook = mmcv.build_from_cfg(timer_config_, HOOKS) - else: - hook = timer_config - self.register_hook(hook, priority='LOW') - - def register_custom_hooks(self, custom_config): - if custom_config is None: - return - - if not isinstance(custom_config, list): - custom_config = [custom_config] - - for item in custom_config: - if isinstance(item, dict): - self.register_hook_from_cfg(item) - else: - self.register_hook(item, priority='NORMAL') - - def register_profiler_hook(self, profiler_config): - if profiler_config is None: - return - if isinstance(profiler_config, dict): - profiler_config.setdefault('type', 'ProfilerHook') - hook = mmcv.build_from_cfg(profiler_config, HOOKS) - else: - hook = profiler_config - self.register_hook(hook) - - def register_training_hooks(self, - lr_config, - optimizer_config=None, - checkpoint_config=None, - log_config=None, - momentum_config=None, - timer_config=dict(type='IterTimerHook'), - custom_hooks_config=None): - """Register default and custom hooks for training. 
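Editor's note: the `register_*_hook` helpers above all follow the same recipe: a plain config dict gets a 'type' key filled in (for lr/momentum the lowercase 'policy' is title-cased and suffixed, e.g. 'step' becomes 'StepLrUpdaterHook') and is then built via `mmcv.build_from_cfg`. A hedged sketch of the config shapes `register_training_hooks()` expects; the concrete values are illustrative.

lr_config = dict(policy='step', step=[8, 11], gamma=0.1)             # -> StepLrUpdaterHook
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))    # -> OptimizerHook
checkpoint_config = dict(interval=1)                                 # -> CheckpointHook
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])

# Assuming `runner` is an already-constructed BaseRunner subclass instance:
# runner.register_training_hooks(lr_config,
#                                optimizer_config=optimizer_config,
#                                checkpoint_config=checkpoint_config,
#                                log_config=log_config)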
- - Default and custom hooks include: - - +----------------------+-------------------------+ - | Hooks | Priority | - +======================+=========================+ - | LrUpdaterHook | VERY_HIGH (10) | - +----------------------+-------------------------+ - | MomentumUpdaterHook | HIGH (30) | - +----------------------+-------------------------+ - | OptimizerStepperHook | ABOVE_NORMAL (40) | - +----------------------+-------------------------+ - | CheckpointSaverHook | NORMAL (50) | - +----------------------+-------------------------+ - | IterTimerHook | LOW (70) | - +----------------------+-------------------------+ - | LoggerHook(s) | VERY_LOW (90) | - +----------------------+-------------------------+ - | CustomHook(s) | defaults to NORMAL (50) | - +----------------------+-------------------------+ - - If custom hooks have same priority with default hooks, custom hooks - will be triggered after default hooks. - """ - self.register_lr_hook(lr_config) - self.register_momentum_hook(momentum_config) - self.register_optimizer_hook(optimizer_config) - self.register_checkpoint_hook(checkpoint_config) - self.register_timer_hook(timer_config) - self.register_logger_hooks(log_config) - self.register_custom_hooks(custom_hooks_config) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/builder.py b/ControlNet/annotator/uniformer/mmcv/runner/builder.py deleted file mode 100644 index 77c96ba0b2f30ead9da23f293c5dc84dd3e4a74f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/builder.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -from ..utils import Registry - -RUNNERS = Registry('runner') -RUNNER_BUILDERS = Registry('runner builder') - - -def build_runner_constructor(cfg): - return RUNNER_BUILDERS.build(cfg) - - -def build_runner(cfg, default_args=None): - runner_cfg = copy.deepcopy(cfg) - constructor_type = runner_cfg.pop('constructor', - 'DefaultRunnerConstructor') - runner_constructor = build_runner_constructor( - dict( - type=constructor_type, - runner_cfg=runner_cfg, - default_args=default_args)) - runner = runner_constructor() - return runner diff --git a/ControlNet/annotator/uniformer/mmcv/runner/checkpoint.py b/ControlNet/annotator/uniformer/mmcv/runner/checkpoint.py deleted file mode 100644 index b29ca320679164432f446adad893e33fb2b4b29e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/checkpoint.py +++ /dev/null @@ -1,707 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import io -import os -import os.path as osp -import pkgutil -import re -import time -import warnings -from collections import OrderedDict -from importlib import import_module -from tempfile import TemporaryDirectory - -import torch -import torchvision -from torch.optim import Optimizer -from torch.utils import model_zoo - -import annotator.uniformer.mmcv as mmcv -from ..fileio import FileClient -from ..fileio import load as load_file -from ..parallel import is_module_wrapper -from ..utils import mkdir_or_exist -from .dist_utils import get_dist_info - -ENV_MMCV_HOME = 'MMCV_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - - -def _get_mmcv_home(): - mmcv_home = os.path.expanduser( - os.getenv( - ENV_MMCV_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) - - mkdir_or_exist(mmcv_home) - return mmcv_home - - -def load_state_dict(module, state_dict, strict=False, logger=None): - """Load state_dict to a module. 
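Editor's note: `build_runner` in builder.py above resolves an optional 'constructor' key (defaulting to `DefaultRunnerConstructor`, shown further below) and then builds the runner from the registry, merging `default_args` into the cfg. A hedged usage sketch assuming upstream mmcv 1.x; the model and max_iters are illustrative.

import logging
import torch.nn as nn
from mmcv.runner import build_runner


class ToyModel(nn.Module):
    def train_step(self, data, optimizer=None):
        return dict(loss=0.0)


runner = build_runner(
    dict(type='IterBasedRunner', max_iters=1000),
    default_args=dict(
        model=ToyModel(),
        optimizer=None,
        work_dir=None,
        logger=logging.getLogger('demo')))
print(type(runner).__name__, runner.max_iters)   # IterBasedRunner 1000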
- - This method is modified from :meth:`torch.nn.Module.load_state_dict`. - Default value for ``strict`` is set to ``False`` and the message for - param mismatch will be shown even if strict is False. - - Args: - module (Module): Module that receives the state_dict. - state_dict (OrderedDict): Weights. - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. - logger (:obj:`logging.Logger`, optional): Logger to log the error - message. If not specified, print function will be used. - """ - unexpected_keys = [] - all_missing_keys = [] - err_msg = [] - - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # use _load_from_state_dict to enable checkpoint version control - def load(module, prefix=''): - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, True, - all_missing_keys, unexpected_keys, - err_msg) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(module) - load = None # break load->load reference cycle - - # ignore "num_batches_tracked" of BN layers - missing_keys = [ - key for key in all_missing_keys if 'num_batches_tracked' not in key - ] - - if unexpected_keys: - err_msg.append('unexpected key in source ' - f'state_dict: {", ".join(unexpected_keys)}\n') - if missing_keys: - err_msg.append( - f'missing keys in source state_dict: {", ".join(missing_keys)}\n') - - rank, _ = get_dist_info() - if len(err_msg) > 0 and rank == 0: - err_msg.insert( - 0, 'The model and loaded state dict do not match exactly\n') - err_msg = '\n'.join(err_msg) - if strict: - raise RuntimeError(err_msg) - elif logger is not None: - logger.warning(err_msg) - else: - print(err_msg) - - -def get_torchvision_models(): - model_urls = dict() - for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): - if ispkg: - continue - _zoo = import_module(f'torchvision.models.{name}') - if hasattr(_zoo, 'model_urls'): - _urls = getattr(_zoo, 'model_urls') - model_urls.update(_urls) - return model_urls - - -def get_external_models(): - mmcv_home = _get_mmcv_home() - default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') - default_urls = load_file(default_json_path) - assert isinstance(default_urls, dict) - external_json_path = osp.join(mmcv_home, 'open_mmlab.json') - if osp.exists(external_json_path): - external_urls = load_file(external_json_path) - assert isinstance(external_urls, dict) - default_urls.update(external_urls) - - return default_urls - - -def get_mmcls_models(): - mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') - mmcls_urls = load_file(mmcls_json_path) - - return mmcls_urls - - -def get_deprecated_model_names(): - deprecate_json_path = osp.join(mmcv.__path__[0], - 'model_zoo/deprecated.json') - deprecate_urls = load_file(deprecate_json_path) - assert isinstance(deprecate_urls, dict) - - return deprecate_urls - - -def _process_mmcls_checkpoint(checkpoint): - state_dict = checkpoint['state_dict'] - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - if k.startswith('backbone.'): - 
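Editor's note: unlike `torch.nn.Module.load_state_dict`, the `load_state_dict` above defaults to `strict=False` and still reports missing/unexpected keys instead of staying silent. A small sketch assuming upstream mmcv 1.x; the two toy modules are illustrative.

import torch.nn as nn
from mmcv.runner import load_state_dict

src = nn.Sequential(nn.Linear(2, 2))
dst = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))

# strict=False (the default): the mismatch is printed or logged, not raised;
# here the second Linear's keys are reported as missing.
load_state_dict(dst, src.state_dict(), strict=False)

try:
    load_state_dict(dst, src.state_dict(), strict=True)
except RuntimeError as err:
    print('strict load failed:', err)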
new_state_dict[k[9:]] = v - new_checkpoint = dict(state_dict=new_state_dict) - - return new_checkpoint - - -class CheckpointLoader: - """A general checkpoint loader to manage all schemes.""" - - _schemes = {} - - @classmethod - def _register_scheme(cls, prefixes, loader, force=False): - if isinstance(prefixes, str): - prefixes = [prefixes] - else: - assert isinstance(prefixes, (list, tuple)) - for prefix in prefixes: - if (prefix not in cls._schemes) or force: - cls._schemes[prefix] = loader - else: - raise KeyError( - f'{prefix} is already registered as a loader backend, ' - 'add "force=True" if you want to override it') - # sort, longer prefixes take priority - cls._schemes = OrderedDict( - sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) - - @classmethod - def register_scheme(cls, prefixes, loader=None, force=False): - """Register a loader to CheckpointLoader. - - This method can be used as a normal class method or a decorator. - - Args: - prefixes (str or list[str] or tuple[str]): - The prefix of the registered loader. - loader (function, optional): The loader function to be registered. - When this method is used as a decorator, loader is None. - Defaults to None. - force (bool, optional): Whether to override the loader - if the prefix has already been registered. Defaults to False. - """ - - if loader is not None: - cls._register_scheme(prefixes, loader, force=force) - return - - def _register(loader_cls): - cls._register_scheme(prefixes, loader_cls, force=force) - return loader_cls - - return _register - - @classmethod - def _get_checkpoint_loader(cls, path): - """Finds a loader that supports the given path. Falls back to the local - loader if no other loader is found. - - Args: - path (str): checkpoint path - - Returns: - loader (function): checkpoint loader - """ - - for p in cls._schemes: - if path.startswith(p): - return cls._schemes[p] - - @classmethod - def load_checkpoint(cls, filename, map_location=None, logger=None): - """load checkpoint through URL scheme path. - - Args: - filename (str): checkpoint file name with given prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - logger (:mod:`logging.Logger`, optional): The logger for message. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - checkpoint_loader = cls._get_checkpoint_loader(filename) - class_name = checkpoint_loader.__name__ - mmcv.print_log( - f'load checkpoint from {class_name[10:]} path: {filename}', logger) - return checkpoint_loader(filename, map_location) - - -@CheckpointLoader.register_scheme(prefixes='') -def load_from_local(filename, map_location): - """load checkpoint by local file path. - - Args: - filename (str): local checkpoint file path - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) -def load_from_http(filename, map_location=None, model_dir=None): - """load checkpoint through HTTP or HTTPS scheme path. In distributed - setting, this function only download checkpoint at local rank 0. - - Args: - filename (str): checkpoint file path with modelzoo or - torchvision prefix - map_location (str, optional): Same as :func:`torch.load`. 
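Editor's note: `CheckpointLoader.register_scheme` above can be used as a decorator to attach a loader to a URI prefix; `load_checkpoint` then picks the longest matching prefix. A hedged sketch assuming upstream mmcv 1.x; the 'toy://' scheme and its loader are hypothetical.

import torch
from mmcv.runner import CheckpointLoader


@CheckpointLoader.register_scheme(prefixes='toy://')
def load_from_toy(filename, map_location=None):
    # Hypothetical scheme: ignore the path and return a fixed in-memory
    # checkpoint, just to show the dispatch plumbing.
    return {'state_dict': {'weight': torch.zeros(1)}}


ckpt = CheckpointLoader.load_checkpoint('toy://anything')
print(list(ckpt['state_dict']))   # ['weight']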
- model_dir (string, optional): directory in which to save the object, - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - checkpoint = model_zoo.load_url( - filename, model_dir=model_dir, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - checkpoint = model_zoo.load_url( - filename, model_dir=model_dir, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes='pavi://') -def load_from_pavi(filename, map_location=None): - """load checkpoint through the file path prefixed with pavi. In distributed - setting, this function download ckpt at all ranks to different temporary - directories. - - Args: - filename (str): checkpoint file path with pavi prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - assert filename.startswith('pavi://'), \ - f'Expected filename startswith `pavi://`, but get {filename}' - model_path = filename[7:] - - try: - from pavi import modelcloud - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load(downloaded_file, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes='s3://') -def load_from_ceph(filename, map_location=None, backend='petrel'): - """load checkpoint through the file path prefixed with s3. In distributed - setting, this function download ckpt at all ranks to different temporary - directories. - - Args: - filename (str): checkpoint file path with s3 prefix - map_location (str, optional): Same as :func:`torch.load`. - backend (str, optional): The storage backend type. Options are 'ceph', - 'petrel'. Default: 'petrel'. - - .. warning:: - :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, - please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - allowed_backends = ['ceph', 'petrel'] - if backend not in allowed_backends: - raise ValueError(f'Load from Backend {backend} is not supported.') - - if backend == 'ceph': - warnings.warn( - 'CephBackend will be deprecated, please use PetrelBackend instead') - - # CephClient and PetrelBackend have the same prefix 's3://' and the latter - # will be chosen as default. If PetrelBackend can not be instantiated - # successfully, the CephClient will be chosen. - try: - file_client = FileClient(backend=backend) - except ImportError: - allowed_backends.remove(backend) - file_client = FileClient(backend=allowed_backends[0]) - - with io.BytesIO(file_client.get(filename)) as buffer: - checkpoint = torch.load(buffer, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) -def load_from_torchvision(filename, map_location=None): - """load checkpoint through the file path prefixed with modelzoo or - torchvision. - - Args: - filename (str): checkpoint file path with modelzoo or - torchvision prefix - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
- """ - model_urls = get_torchvision_models() - if filename.startswith('modelzoo://'): - warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' - 'use "torchvision://" instead') - model_name = filename[11:] - else: - model_name = filename[14:] - return load_from_http(model_urls[model_name], map_location=map_location) - - -@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) -def load_from_openmmlab(filename, map_location=None): - """load checkpoint through the file path prefixed with open-mmlab or - openmmlab. - - Args: - filename (str): checkpoint file path with open-mmlab or - openmmlab prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - model_urls = get_external_models() - prefix_str = 'open-mmlab://' - if filename.startswith(prefix_str): - model_name = filename[13:] - else: - model_name = filename[12:] - prefix_str = 'openmmlab://' - - deprecated_urls = get_deprecated_model_names() - if model_name in deprecated_urls: - warnings.warn(f'{prefix_str}{model_name} is deprecated in favor ' - f'of {prefix_str}{deprecated_urls[model_name]}') - model_name = deprecated_urls[model_name] - model_url = model_urls[model_name] - # check if is url - if model_url.startswith(('http://', 'https://')): - checkpoint = load_from_http(model_url, map_location=map_location) - else: - filename = osp.join(_get_mmcv_home(), model_url) - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes='mmcls://') -def load_from_mmcls(filename, map_location=None): - """load checkpoint through the file path prefixed with mmcls. - - Args: - filename (str): checkpoint file path with mmcls prefix - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - model_urls = get_mmcls_models() - model_name = filename[8:] - checkpoint = load_from_http( - model_urls[model_name], map_location=map_location) - checkpoint = _process_mmcls_checkpoint(checkpoint) - return checkpoint - - -def _load_checkpoint(filename, map_location=None, logger=None): - """Load checkpoint from somewhere (modelzoo, file, url). - - Args: - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str, optional): Same as :func:`torch.load`. - Default: None. - logger (:mod:`logging.Logger`, optional): The logger for error message. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. It can be either an - OrderedDict storing model weights or a dict containing other - information, which depends on the checkpoint. - """ - return CheckpointLoader.load_checkpoint(filename, map_location, logger) - - -def _load_checkpoint_with_prefix(prefix, filename, map_location=None): - """Load partial pretrained model with specific prefix. - - Args: - prefix (str): The prefix of sub-module. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str | None): Same as :func:`torch.load`. Default: None. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
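Editor's note: `_load_checkpoint_with_prefix` (its docstring ends just above, the body follows below) keeps only the entries under a given sub-module prefix and strips that prefix, which is the usual way to initialize a bare backbone from a full-model checkpoint. A hedged sketch assuming upstream mmcv 1.x; 'toy.pth' is a hypothetical path written to the working directory.

import torch
import torch.nn as nn
from mmcv.runner import _load_checkpoint_with_prefix

# Save a toy checkpoint whose keys are namespaced with 'backbone.' / 'head.'.
full = nn.ModuleDict(dict(backbone=nn.Linear(4, 4), head=nn.Linear(4, 2)))
torch.save(dict(state_dict=full.state_dict()), 'toy.pth')   # hypothetical path

# Keep only the 'backbone.' entries, prefix stripped, so the result loads
# straight into a bare nn.Linear(4, 4).
backbone_sd = _load_checkpoint_with_prefix('backbone', 'toy.pth')
print(sorted(backbone_sd))   # ['bias', 'weight']
nn.Linear(4, 4).load_state_dict(backbone_sd)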
- """ - - checkpoint = _load_checkpoint(filename, map_location=map_location) - - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - if not prefix.endswith('.'): - prefix += '.' - prefix_len = len(prefix) - - state_dict = { - k[prefix_len:]: v - for k, v in state_dict.items() if k.startswith(prefix) - } - - assert state_dict, f'{prefix} is not in the pretrained model' - return state_dict - - -def load_checkpoint(model, - filename, - map_location=None, - strict=False, - logger=None, - revise_keys=[(r'^module\.', '')]): - """Load checkpoint from a file or URI. - - Args: - model (Module): Module to load checkpoint. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str): Same as :func:`torch.load`. - strict (bool): Whether to allow different params for the model and - checkpoint. - logger (:mod:`logging.Logger` or None): The logger for error message. - revise_keys (list): A list of customized keywords to modify the - state_dict in checkpoint. Each item is a (pattern, replacement) - pair of the regular expression operations. Default: strip - the prefix 'module.' by [(r'^module\\.', '')]. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - checkpoint = _load_checkpoint(filename, map_location, logger) - # OrderedDict is a subclass of dict - if not isinstance(checkpoint, dict): - raise RuntimeError( - f'No state_dict found in checkpoint file {filename}') - # get state_dict from checkpoint - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - - # strip prefix of state_dict - metadata = getattr(state_dict, '_metadata', OrderedDict()) - for p, r in revise_keys: - state_dict = OrderedDict( - {re.sub(p, r, k): v - for k, v in state_dict.items()}) - # Keep metadata in state_dict - state_dict._metadata = metadata - - # load state_dict - load_state_dict(model, state_dict, strict, logger) - return checkpoint - - -def weights_to_cpu(state_dict): - """Copy a model state_dict to cpu. - - Args: - state_dict (OrderedDict): Model weights on GPU. - - Returns: - OrderedDict: Model weights on GPU. - """ - state_dict_cpu = OrderedDict() - for key, val in state_dict.items(): - state_dict_cpu[key] = val.cpu() - # Keep metadata in state_dict - state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) - return state_dict_cpu - - -def _save_to_state_dict(module, destination, prefix, keep_vars): - """Saves module state to `destination` dictionary. - - This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. - - Args: - module (nn.Module): The module to generate state_dict. - destination (dict): A dict where state will be stored. - prefix (str): The prefix for parameters and buffers used in this - module. - """ - for name, param in module._parameters.items(): - if param is not None: - destination[prefix + name] = param if keep_vars else param.detach() - for name, buf in module._buffers.items(): - # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d - if buf is not None: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def get_state_dict(module, destination=None, prefix='', keep_vars=False): - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. 
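Editor's note: `load_checkpoint` above applies `revise_keys` (by default stripping a leading 'module.') before loading, which is what lets checkpoints saved from a DataParallel-wrapped model load into the bare module. A hedged sketch assuming upstream mmcv 1.x; 'wrapped.pth' is a hypothetical path.

import torch
import torch.nn as nn
from mmcv.runner import load_checkpoint

# Simulate a checkpoint saved from a wrapped model: every key carries 'module.'.
model = nn.Linear(3, 3)
wrapped_sd = {'module.' + k: v for k, v in model.state_dict().items()}
torch.save(dict(state_dict=wrapped_sd), 'wrapped.pth')   # hypothetical path

# The default revise_keys=[(r'^module\.', '')] strips the prefix before
# load_state_dict, so the bare model accepts the weights.
ckpt = load_checkpoint(nn.Linear(3, 3), 'wrapped.pth', map_location='cpu')
print('state_dict' in ckpt)   # True, the full checkpoint dict is returned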
- - This method is modified from :meth:`torch.nn.Module.state_dict` to - recursively check parallel module in case that the model has a complicated - structure, e.g., nn.Module(nn.Module(DDP)). - - Args: - module (nn.Module): The module to generate state_dict. - destination (OrderedDict): Returned dict for the state of the - module. - prefix (str): Prefix of the key. - keep_vars (bool): Whether to keep the variable property of the - parameters. Default: False. - - Returns: - dict: A dictionary containing a whole state of the module. - """ - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - - # below is the same as torch.nn.Module.state_dict() - if destination is None: - destination = OrderedDict() - destination._metadata = OrderedDict() - destination._metadata[prefix[:-1]] = local_metadata = dict( - version=module._version) - _save_to_state_dict(module, destination, prefix, keep_vars) - for name, child in module._modules.items(): - if child is not None: - get_state_dict( - child, destination, prefix + name + '.', keep_vars=keep_vars) - for hook in module._state_dict_hooks.values(): - hook_result = hook(module, destination, prefix, local_metadata) - if hook_result is not None: - destination = hook_result - return destination - - -def save_checkpoint(model, - filename, - optimizer=None, - meta=None, - file_client_args=None): - """Save checkpoint to file. - - The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. - - Args: - model (Module): Module whose params are to be saved. - filename (str): Checkpoint filename. - optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. - meta (dict, optional): Metadata to be saved in checkpoint. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. 
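Concretely, the file written by save_checkpoint is a pickled dict with the three fields described above; a sketch of what comes back from torch.load, with placeholder values:

import torch

# checkpoint = torch.load('epoch_1.pth', map_location='cpu')
checkpoint = {
    'meta': {'mmcv_version': '1.x', 'time': 'Mon Jan  1 00:00:00 2024',
             'epoch': 1, 'iter': 500},
    'state_dict': {'conv.weight': torch.zeros(3, 3)},    # moved to CPU, see weights_to_cpu above
    'optimizer': {'state': {}, 'param_groups': []},      # only present when an optimizer was passed
}

weights = checkpoint['state_dict']                 # what load_checkpoint feeds into the model
resume_epoch = checkpoint['meta'].get('epoch', 0)  # what a runner reads back when resuming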
- `New in version 1.3.16.` - """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError(f'meta must be a dict or None, but got {type(meta)}') - meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - - if is_module_wrapper(model): - model = model.module - - if hasattr(model, 'CLASSES') and model.CLASSES is not None: - # save class name to the meta - meta.update(CLASSES=model.CLASSES) - - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(get_state_dict(model)) - } - # save optimizer state dict in the checkpoint - if isinstance(optimizer, Optimizer): - checkpoint['optimizer'] = optimizer.state_dict() - elif isinstance(optimizer, dict): - checkpoint['optimizer'] = {} - for name, optim in optimizer.items(): - checkpoint['optimizer'][name] = optim.state_dict() - - if filename.startswith('pavi://'): - if file_client_args is not None: - raise ValueError( - 'file_client_args should be "None" if filename starts with' - f'"pavi://", but got {file_client_args}') - try: - from pavi import modelcloud - from pavi import exception - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - model_path = filename[7:] - root = modelcloud.Folder() - model_dir, model_name = osp.split(model_path) - try: - model = modelcloud.get(model_dir) - except exception.NodeNotFoundError: - model = root.create_training_model(model_dir) - with TemporaryDirectory() as tmp_dir: - checkpoint_file = osp.join(tmp_dir, model_name) - with open(checkpoint_file, 'wb') as f: - torch.save(checkpoint, f) - f.flush() - model.create_file(checkpoint_file, name=model_name) - else: - file_client = FileClient.infer_client(file_client_args, filename) - with io.BytesIO() as f: - torch.save(checkpoint, f) - file_client.put(f.getvalue(), filename) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/default_constructor.py b/ControlNet/annotator/uniformer/mmcv/runner/default_constructor.py deleted file mode 100644 index 3f1f5b44168768dfda3947393a63a6cf9cf50b41..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/default_constructor.py +++ /dev/null @@ -1,44 +0,0 @@ -from .builder import RUNNER_BUILDERS, RUNNERS - - -@RUNNER_BUILDERS.register_module() -class DefaultRunnerConstructor: - """Default constructor for runners. - - Custom existing `Runner` like `EpocBasedRunner` though `RunnerConstructor`. - For example, We can inject some new properties and functions for `Runner`. - - Example: - >>> from annotator.uniformer.mmcv.runner import RUNNER_BUILDERS, build_runner - >>> # Define a new RunnerReconstructor - >>> @RUNNER_BUILDERS.register_module() - >>> class MyRunnerConstructor: - ... def __init__(self, runner_cfg, default_args=None): - ... if not isinstance(runner_cfg, dict): - ... raise TypeError('runner_cfg should be a dict', - ... f'but got {type(runner_cfg)}') - ... self.runner_cfg = runner_cfg - ... self.default_args = default_args - ... - ... def __call__(self): - ... runner = RUNNERS.build(self.runner_cfg, - ... default_args=self.default_args) - ... # Add new properties for existing runner - ... runner.my_name = 'my_runner' - ... runner.my_function = lambda self: print(self.my_name) - ... ... - >>> # build your runner - >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40, - ... 
constructor='MyRunnerConstructor') - >>> runner = build_runner(runner_cfg) - """ - - def __init__(self, runner_cfg, default_args=None): - if not isinstance(runner_cfg, dict): - raise TypeError('runner_cfg should be a dict', - f'but got {type(runner_cfg)}') - self.runner_cfg = runner_cfg - self.default_args = default_args - - def __call__(self): - return RUNNERS.build(self.runner_cfg, default_args=self.default_args) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/dist_utils.py b/ControlNet/annotator/uniformer/mmcv/runner/dist_utils.py deleted file mode 100644 index d3a1ef3fda5ceeb31bf15a73779da1b1903ab0fe..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/dist_utils.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import functools -import os -import subprocess -from collections import OrderedDict - -import torch -import torch.multiprocessing as mp -from torch import distributed as dist -from torch._utils import (_flatten_dense_tensors, _take_tensors, - _unflatten_dense_tensors) - - -def init_dist(launcher, backend='nccl', **kwargs): - if mp.get_start_method(allow_none=True) is None: - mp.set_start_method('spawn') - if launcher == 'pytorch': - _init_dist_pytorch(backend, **kwargs) - elif launcher == 'mpi': - _init_dist_mpi(backend, **kwargs) - elif launcher == 'slurm': - _init_dist_slurm(backend, **kwargs) - else: - raise ValueError(f'Invalid launcher type: {launcher}') - - -def _init_dist_pytorch(backend, **kwargs): - # TODO: use local_rank instead of rank % num_gpus - rank = int(os.environ['RANK']) - num_gpus = torch.cuda.device_count() - torch.cuda.set_device(rank % num_gpus) - dist.init_process_group(backend=backend, **kwargs) - - -def _init_dist_mpi(backend, **kwargs): - # TODO: use local_rank instead of rank % num_gpus - rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - num_gpus = torch.cuda.device_count() - torch.cuda.set_device(rank % num_gpus) - dist.init_process_group(backend=backend, **kwargs) - - -def _init_dist_slurm(backend, port=None): - """Initialize slurm distributed training environment. - - If argument ``port`` is not specified, then the master port will be system - environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system - environment variable, then a default port ``29500`` will be used. - - Args: - backend (str): Backend of torch.distributed. - port (int, optional): Master port. Defaults to None. 
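Wiring the docstring example above into real use looks roughly like this; model, optimizer, logger and the registered MyRunnerConstructor are assumed to exist already, and the import path is the one vendored in this repo:

from annotator.uniformer.mmcv.runner import build_runner

runner_cfg = dict(
    type='EpochBasedRunner',
    max_epochs=40,
    constructor='MyRunnerConstructor',   # selects the RUNNER_BUILDERS entry
)
runner = build_runner(
    runner_cfg,
    default_args=dict(model=model, optimizer=optimizer,
                      work_dir='./work_dir', logger=logger))
runner.my_function(runner)               # prints 'my_runner', injected by the constructor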
- """ - proc_id = int(os.environ['SLURM_PROCID']) - ntasks = int(os.environ['SLURM_NTASKS']) - node_list = os.environ['SLURM_NODELIST'] - num_gpus = torch.cuda.device_count() - torch.cuda.set_device(proc_id % num_gpus) - addr = subprocess.getoutput( - f'scontrol show hostname {node_list} | head -n1') - # specify master port - if port is not None: - os.environ['MASTER_PORT'] = str(port) - elif 'MASTER_PORT' in os.environ: - pass # use MASTER_PORT in the environment variable - else: - # 29500 is torch.distributed default port - os.environ['MASTER_PORT'] = '29500' - # use MASTER_ADDR in the environment variable if it already exists - if 'MASTER_ADDR' not in os.environ: - os.environ['MASTER_ADDR'] = addr - os.environ['WORLD_SIZE'] = str(ntasks) - os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) - os.environ['RANK'] = str(proc_id) - dist.init_process_group(backend=backend) - - -def get_dist_info(): - if dist.is_available() and dist.is_initialized(): - rank = dist.get_rank() - world_size = dist.get_world_size() - else: - rank = 0 - world_size = 1 - return rank, world_size - - -def master_only(func): - - @functools.wraps(func) - def wrapper(*args, **kwargs): - rank, _ = get_dist_info() - if rank == 0: - return func(*args, **kwargs) - - return wrapper - - -def allreduce_params(params, coalesce=True, bucket_size_mb=-1): - """Allreduce parameters. - - Args: - params (list[torch.Parameters]): List of parameters or buffers of a - model. - coalesce (bool, optional): Whether allreduce parameters as a whole. - Defaults to True. - bucket_size_mb (int, optional): Size of bucket, the unit is MB. - Defaults to -1. - """ - _, world_size = get_dist_info() - if world_size == 1: - return - params = [param.data for param in params] - if coalesce: - _allreduce_coalesced(params, world_size, bucket_size_mb) - else: - for tensor in params: - dist.all_reduce(tensor.div_(world_size)) - - -def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): - """Allreduce gradients. - - Args: - params (list[torch.Parameters]): List of parameters of a model - coalesce (bool, optional): Whether allreduce parameters as a whole. - Defaults to True. - bucket_size_mb (int, optional): Size of bucket, the unit is MB. - Defaults to -1. 
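get_dist_info and master_only above are the building blocks for rank-guarded work such as logging or checkpointing. A small standalone sketch of the same pattern, which also runs when torch.distributed is not initialised:

import functools

def get_dist_info():
    # falls back to single-process values when distributed is not initialised
    try:
        import torch.distributed as dist
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank(), dist.get_world_size()
    except ImportError:
        pass
    return 0, 1

def master_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        rank, _ = get_dist_info()
        if rank == 0:                 # every other rank silently returns None
            return func(*args, **kwargs)
    return wrapper

@master_only
def log_banner(msg):
    print(f'[rank 0 only] {msg}')

log_banner('saving checkpoint')       # printed once per job, not once per rank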
- """ - grads = [ - param.grad.data for param in params - if param.requires_grad and param.grad is not None - ] - _, world_size = get_dist_info() - if world_size == 1: - return - if coalesce: - _allreduce_coalesced(grads, world_size, bucket_size_mb) - else: - for tensor in grads: - dist.all_reduce(tensor.div_(world_size)) - - -def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): - if bucket_size_mb > 0: - bucket_size_bytes = bucket_size_mb * 1024 * 1024 - buckets = _take_tensors(tensors, bucket_size_bytes) - else: - buckets = OrderedDict() - for tensor in tensors: - tp = tensor.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(tensor) - buckets = buckets.values() - - for bucket in buckets: - flat_tensors = _flatten_dense_tensors(bucket) - dist.all_reduce(flat_tensors) - flat_tensors.div_(world_size) - for tensor, synced in zip( - bucket, _unflatten_dense_tensors(flat_tensors, bucket)): - tensor.copy_(synced) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/epoch_based_runner.py b/ControlNet/annotator/uniformer/mmcv/runner/epoch_based_runner.py deleted file mode 100644 index 766a9ce6afdf09cd11b1b15005f5132583011348..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/epoch_based_runner.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import platform -import shutil -import time -import warnings - -import torch - -import annotator.uniformer.mmcv as mmcv -from .base_runner import BaseRunner -from .builder import RUNNERS -from .checkpoint import save_checkpoint -from .utils import get_host_info - - -@RUNNERS.register_module() -class EpochBasedRunner(BaseRunner): - """Epoch-based Runner. - - This runner train models epoch by epoch. 
- """ - - def run_iter(self, data_batch, train_mode, **kwargs): - if self.batch_processor is not None: - outputs = self.batch_processor( - self.model, data_batch, train_mode=train_mode, **kwargs) - elif train_mode: - outputs = self.model.train_step(data_batch, self.optimizer, - **kwargs) - else: - outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('"batch_processor()" or "model.train_step()"' - 'and "model.val_step()" must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - - def train(self, data_loader, **kwargs): - self.model.train() - self.mode = 'train' - self.data_loader = data_loader - self._max_iters = self._max_epochs * len(self.data_loader) - self.call_hook('before_train_epoch') - time.sleep(2) # Prevent possible deadlock during epoch transition - for i, data_batch in enumerate(self.data_loader): - self._inner_iter = i - self.call_hook('before_train_iter') - self.run_iter(data_batch, train_mode=True, **kwargs) - self.call_hook('after_train_iter') - self._iter += 1 - - self.call_hook('after_train_epoch') - self._epoch += 1 - - @torch.no_grad() - def val(self, data_loader, **kwargs): - self.model.eval() - self.mode = 'val' - self.data_loader = data_loader - self.call_hook('before_val_epoch') - time.sleep(2) # Prevent possible deadlock during epoch transition - for i, data_batch in enumerate(self.data_loader): - self._inner_iter = i - self.call_hook('before_val_iter') - self.run_iter(data_batch, train_mode=False) - self.call_hook('after_val_iter') - - self.call_hook('after_val_epoch') - - def run(self, data_loaders, workflow, max_epochs=None, **kwargs): - """Start running. - - Args: - data_loaders (list[:obj:`DataLoader`]): Dataloaders for training - and validation. - workflow (list[tuple]): A list of (phase, epochs) to specify the - running order and epochs. E.g, [('train', 2), ('val', 1)] means - running 2 epochs for training and 1 epoch for validation, - iteratively. 
- """ - assert isinstance(data_loaders, list) - assert mmcv.is_list_of(workflow, tuple) - assert len(data_loaders) == len(workflow) - if max_epochs is not None: - warnings.warn( - 'setting max_epochs in run is deprecated, ' - 'please set max_epochs in runner_config', DeprecationWarning) - self._max_epochs = max_epochs - - assert self._max_epochs is not None, ( - 'max_epochs must be specified during instantiation') - - for i, flow in enumerate(workflow): - mode, epochs = flow - if mode == 'train': - self._max_iters = self._max_epochs * len(data_loaders[i]) - break - - work_dir = self.work_dir if self.work_dir is not None else 'NONE' - self.logger.info('Start running, host: %s, work_dir: %s', - get_host_info(), work_dir) - self.logger.info('Hooks will be executed in the following order:\n%s', - self.get_hook_info()) - self.logger.info('workflow: %s, max: %d epochs', workflow, - self._max_epochs) - self.call_hook('before_run') - - while self.epoch < self._max_epochs: - for i, flow in enumerate(workflow): - mode, epochs = flow - if isinstance(mode, str): # self.train() - if not hasattr(self, mode): - raise ValueError( - f'runner has no method named "{mode}" to run an ' - 'epoch') - epoch_runner = getattr(self, mode) - else: - raise TypeError( - 'mode in workflow must be a str, but got {}'.format( - type(mode))) - - for _ in range(epochs): - if mode == 'train' and self.epoch >= self._max_epochs: - break - epoch_runner(data_loaders[i], **kwargs) - - time.sleep(1) # wait for some hooks like loggers to finish - self.call_hook('after_run') - - def save_checkpoint(self, - out_dir, - filename_tmpl='epoch_{}.pth', - save_optimizer=True, - meta=None, - create_symlink=True): - """Save the checkpoint. - - Args: - out_dir (str): The directory that checkpoints are saved. - filename_tmpl (str, optional): The checkpoint filename template, - which contains a placeholder for the epoch number. - Defaults to 'epoch_{}.pth'. - save_optimizer (bool, optional): Whether to save the optimizer to - the checkpoint. Defaults to True. - meta (dict, optional): The meta information to be saved in the - checkpoint. Defaults to None. - create_symlink (bool, optional): Whether to create a symlink - "latest.pth" to point to the latest checkpoint. - Defaults to True. - """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError( - f'meta should be a dict or None, but got {type(meta)}') - if self.meta is not None: - meta.update(self.meta) - # Note: meta.update(self.meta) should be done before - # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise - # there will be problems with resumed checkpoints. 
- # More details in https://github.com/open-mmlab/mmcv/pull/1108 - meta.update(epoch=self.epoch + 1, iter=self.iter) - - filename = filename_tmpl.format(self.epoch + 1) - filepath = osp.join(out_dir, filename) - optimizer = self.optimizer if save_optimizer else None - save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) - # in some environments, `os.symlink` is not supported, you may need to - # set `create_symlink` to False - if create_symlink: - dst_file = osp.join(out_dir, 'latest.pth') - if platform.system() != 'Windows': - mmcv.symlink(filename, dst_file) - else: - shutil.copy(filepath, dst_file) - - -@RUNNERS.register_module() -class Runner(EpochBasedRunner): - """Deprecated name of EpochBasedRunner.""" - - def __init__(self, *args, **kwargs): - warnings.warn( - 'Runner was deprecated, please use EpochBasedRunner instead') - super().__init__(*args, **kwargs) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/fp16_utils.py b/ControlNet/annotator/uniformer/mmcv/runner/fp16_utils.py deleted file mode 100644 index 1981011d6859192e3e663e29d13500d56ba47f6c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/fp16_utils.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import functools -import warnings -from collections import abc -from inspect import getfullargspec - -import numpy as np -import torch -import torch.nn as nn - -from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version -from .dist_utils import allreduce_grads as _allreduce_grads - -try: - # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported - # and used; otherwise, auto fp16 will adopt mmcv's implementation. - # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16 - # manually, so the behavior may not be consistent with real amp. - from torch.cuda.amp import autocast -except ImportError: - pass - - -def cast_tensor_type(inputs, src_type, dst_type): - """Recursively convert Tensor in inputs from src_type to dst_type. - - Args: - inputs: Inputs that to be casted. - src_type (torch.dtype): Source type.. - dst_type (torch.dtype): Destination type. - - Returns: - The same type with inputs, but all contained Tensors have been cast. - """ - if isinstance(inputs, nn.Module): - return inputs - elif isinstance(inputs, torch.Tensor): - return inputs.to(dst_type) - elif isinstance(inputs, str): - return inputs - elif isinstance(inputs, np.ndarray): - return inputs - elif isinstance(inputs, abc.Mapping): - return type(inputs)({ - k: cast_tensor_type(v, src_type, dst_type) - for k, v in inputs.items() - }) - elif isinstance(inputs, abc.Iterable): - return type(inputs)( - cast_tensor_type(item, src_type, dst_type) for item in inputs) - else: - return inputs - - -def auto_fp16(apply_to=None, out_fp32=False): - """Decorator to enable fp16 training automatically. - - This decorator is useful when you write custom modules and want to support - mixed precision training. If inputs arguments are fp32 tensors, they will - be converted to fp16 automatically. Arguments other than fp32 tensors are - ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the - backend, otherwise, original mmcv implementation will be adopted. - - Args: - apply_to (Iterable, optional): The argument names to be converted. - `None` indicates all arguments. - out_fp32 (bool): Whether to convert the output back to fp32. 
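A typical call into the runner defined above, with runner, train_loader and val_loader standing in for objects built elsewhere:

# workflow: one training epoch, then one validation epoch, repeated until
# the configured max_epochs is reached.
workflow = [('train', 1), ('val', 1)]
runner.run([train_loader, val_loader], workflow)

# checkpoints land in runner.work_dir as epoch_{n}.pth, with latest.pth
# pointing at the newest one (a symlink, or a copy on Windows).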
- - Example: - - >>> import torch.nn as nn - >>> class MyModule1(nn.Module): - >>> - >>> # Convert x and y to fp16 - >>> @auto_fp16() - >>> def forward(self, x, y): - >>> pass - - >>> import torch.nn as nn - >>> class MyModule2(nn.Module): - >>> - >>> # convert pred to fp16 - >>> @auto_fp16(apply_to=('pred', )) - >>> def do_something(self, pred, others): - >>> pass - """ - - def auto_fp16_wrapper(old_func): - - @functools.wraps(old_func) - def new_func(*args, **kwargs): - # check if the module has set the attribute `fp16_enabled`, if not, - # just fallback to the original method. - if not isinstance(args[0], torch.nn.Module): - raise TypeError('@auto_fp16 can only be used to decorate the ' - 'method of nn.Module') - if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): - return old_func(*args, **kwargs) - - # get the arg spec of the decorated method - args_info = getfullargspec(old_func) - # get the argument names to be casted - args_to_cast = args_info.args if apply_to is None else apply_to - # convert the args that need to be processed - new_args = [] - # NOTE: default args are not taken into consideration - if args: - arg_names = args_info.args[:len(args)] - for i, arg_name in enumerate(arg_names): - if arg_name in args_to_cast: - new_args.append( - cast_tensor_type(args[i], torch.float, torch.half)) - else: - new_args.append(args[i]) - # convert the kwargs that need to be processed - new_kwargs = {} - if kwargs: - for arg_name, arg_value in kwargs.items(): - if arg_name in args_to_cast: - new_kwargs[arg_name] = cast_tensor_type( - arg_value, torch.float, torch.half) - else: - new_kwargs[arg_name] = arg_value - # apply converted arguments to the decorated method - if (TORCH_VERSION != 'parrots' and - digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - with autocast(enabled=True): - output = old_func(*new_args, **new_kwargs) - else: - output = old_func(*new_args, **new_kwargs) - # cast the results back to fp32 if necessary - if out_fp32: - output = cast_tensor_type(output, torch.half, torch.float) - return output - - return new_func - - return auto_fp16_wrapper - - -def force_fp32(apply_to=None, out_fp16=False): - """Decorator to convert input arguments to fp32 in force. - - This decorator is useful when you write custom modules and want to support - mixed precision training. If there are some inputs that must be processed - in fp32 mode, then this decorator can handle it. If inputs arguments are - fp16 tensors, they will be converted to fp32 automatically. Arguments other - than fp16 tensors are ignored. If you are using PyTorch >= 1.6, - torch.cuda.amp is used as the backend, otherwise, original mmcv - implementation will be adopted. - - Args: - apply_to (Iterable, optional): The argument names to be converted. - `None` indicates all arguments. - out_fp16 (bool): Whether to convert the output back to fp16. - - Example: - - >>> import torch.nn as nn - >>> class MyModule1(nn.Module): - >>> - >>> # Convert x and y to fp32 - >>> @force_fp32() - >>> def loss(self, x, y): - >>> pass - - >>> import torch.nn as nn - >>> class MyModule2(nn.Module): - >>> - >>> # convert pred to fp32 - >>> @force_fp32(apply_to=('pred', )) - >>> def post_process(self, pred, others): - >>> pass - """ - - def force_fp32_wrapper(old_func): - - @functools.wraps(old_func) - def new_func(*args, **kwargs): - # check if the module has set the attribute `fp16_enabled`, if not, - # just fallback to the original method. 
- if not isinstance(args[0], torch.nn.Module): - raise TypeError('@force_fp32 can only be used to decorate the ' - 'method of nn.Module') - if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): - return old_func(*args, **kwargs) - # get the arg spec of the decorated method - args_info = getfullargspec(old_func) - # get the argument names to be casted - args_to_cast = args_info.args if apply_to is None else apply_to - # convert the args that need to be processed - new_args = [] - if args: - arg_names = args_info.args[:len(args)] - for i, arg_name in enumerate(arg_names): - if arg_name in args_to_cast: - new_args.append( - cast_tensor_type(args[i], torch.half, torch.float)) - else: - new_args.append(args[i]) - # convert the kwargs that need to be processed - new_kwargs = dict() - if kwargs: - for arg_name, arg_value in kwargs.items(): - if arg_name in args_to_cast: - new_kwargs[arg_name] = cast_tensor_type( - arg_value, torch.half, torch.float) - else: - new_kwargs[arg_name] = arg_value - # apply converted arguments to the decorated method - if (TORCH_VERSION != 'parrots' and - digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - with autocast(enabled=False): - output = old_func(*new_args, **new_kwargs) - else: - output = old_func(*new_args, **new_kwargs) - # cast the results back to fp32 if necessary - if out_fp16: - output = cast_tensor_type(output, torch.float, torch.half) - return output - - return new_func - - return force_fp32_wrapper - - -def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): - warnings.warning( - '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be ' - 'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads') - _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb) - - -def wrap_fp16_model(model): - """Wrap the FP32 model to FP16. - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the - backend, otherwise, original mmcv implementation will be adopted. - - For PyTorch >= 1.6, this function will - 1. Set fp16 flag inside the model to True. - - Otherwise: - 1. Convert FP32 model to FP16. - 2. Remain some necessary layers to be FP32, e.g., normalization layers. - 3. Set `fp16_enabled` flag inside the model to True. - - Args: - model (nn.Module): Model in FP32. - """ - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.6.0')): - # convert model to fp16 - model.half() - # patch the normalization layers to make it work in fp32 mode - patch_norm_fp32(model) - # set `fp16_enabled` flag - for m in model.modules(): - if hasattr(m, 'fp16_enabled'): - m.fp16_enabled = True - - -def patch_norm_fp32(module): - """Recursively convert normalization layers from FP16 to FP32. - - Args: - module (nn.Module): The modules to be converted in FP16. - - Returns: - nn.Module: The converted module, the normalization layers have been - converted to FP32. - """ - if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): - module.float() - if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3': - module.forward = patch_forward_method(module.forward, torch.half, - torch.float) - for child in module.children(): - patch_norm_fp32(child) - return module - - -def patch_forward_method(func, src_type, dst_type, convert_output=True): - """Patch the forward method of a module. - - Args: - func (callable): The original forward method. - src_type (torch.dtype): Type of input arguments to be converted from. 
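Both decorators above silently fall back to plain fp32 execution unless the module exposes a truthy fp16_enabled attribute, which is what wrap_fp16_model flips on. A small sketch of that wiring; the Head module is illustrative and the import path is the vendored one:

import torch.nn as nn
from annotator.uniformer.mmcv.runner.fp16_utils import auto_fp16, wrap_fp16_model

class Head(nn.Module):
    def __init__(self):
        super().__init__()
        self.fp16_enabled = False          # decorator stays inert until this is True
        self.fc = nn.Linear(4, 2)

    @auto_fp16(apply_to=('x', ))           # casts x to half only once enabled
    def forward(self, x):
        return self.fc(x)

head = Head()
wrap_fp16_model(head)                      # sets fp16_enabled = True (and halves weights on PyTorch < 1.6)
assert head.fp16_enabled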
- dst_type (torch.dtype): Type of input arguments to be converted to. - convert_output (bool): Whether to convert the output back to src_type. - - Returns: - callable: The patched forward method. - """ - - def new_forward(*args, **kwargs): - output = func(*cast_tensor_type(args, src_type, dst_type), - **cast_tensor_type(kwargs, src_type, dst_type)) - if convert_output: - output = cast_tensor_type(output, dst_type, src_type) - return output - - return new_forward - - -class LossScaler: - """Class that manages loss scaling in mixed precision training which - supports both dynamic or static mode. - - The implementation refers to - https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py. - Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling. - It's important to understand how :class:`LossScaler` operates. - Loss scaling is designed to combat the problem of underflowing - gradients encountered at long times when training fp16 networks. - Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. - If overflowing gradients are encountered, :class:`FP16_Optimizer` then - skips the update step for this particular iteration/minibatch, - and :class:`LossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients - detected,:class:`LossScaler` increases the loss scale once more. - In this way :class:`LossScaler` attempts to "ride the edge" of always - using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float): Initial loss scale value, default: 2**32. - scale_factor (float): Factor used when adjusting the loss scale. - Default: 2. - mode (str): Loss scaling mode. 'dynamic' or 'static' - scale_window (int): Number of consecutive iterations without an - overflow to wait before increasing the loss scale. Default: 1000. 
- """ - - def __init__(self, - init_scale=2**32, - mode='dynamic', - scale_factor=2., - scale_window=1000): - self.cur_scale = init_scale - self.cur_iter = 0 - assert mode in ('dynamic', - 'static'), 'mode can only be dynamic or static' - self.mode = mode - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - - def has_overflow(self, params): - """Check if params contain overflow.""" - if self.mode != 'dynamic': - return False - for p in params: - if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data): - return True - return False - - def _has_inf_or_nan(x): - """Check if params contain NaN.""" - try: - cpu_sum = float(x.float().sum()) - except RuntimeError as instance: - if 'value cannot be converted' not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') \ - or cpu_sum != cpu_sum: - return True - return False - - def update_scale(self, overflow): - """update the current loss scale value when overflow happens.""" - if self.mode != 'dynamic': - return - if overflow: - self.cur_scale = max(self.cur_scale / self.scale_factor, 1) - self.last_overflow_iter = self.cur_iter - else: - if (self.cur_iter - self.last_overflow_iter) % \ - self.scale_window == 0: - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - def state_dict(self): - """Returns the state of the scaler as a :class:`dict`.""" - return dict( - cur_scale=self.cur_scale, - cur_iter=self.cur_iter, - mode=self.mode, - last_overflow_iter=self.last_overflow_iter, - scale_factor=self.scale_factor, - scale_window=self.scale_window) - - def load_state_dict(self, state_dict): - """Loads the loss_scaler state dict. - - Args: - state_dict (dict): scaler state. - """ - self.cur_scale = state_dict['cur_scale'] - self.cur_iter = state_dict['cur_iter'] - self.mode = state_dict['mode'] - self.last_overflow_iter = state_dict['last_overflow_iter'] - self.scale_factor = state_dict['scale_factor'] - self.scale_window = state_dict['scale_window'] - - @property - def loss_scale(self): - return self.cur_scale diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/__init__.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/__init__.py deleted file mode 100644 index 915af28cefab14a14c1188ed861161080fd138a3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
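A hedged sketch of the dynamic-scaling loop the class above supports; model, optimizer and loader are assumed to exist, and in practice this bookkeeping is done by the fp16 optimizer hooks rather than by hand:

scaler = LossScaler(init_scale=2**16, mode='dynamic', scale_window=1000)

for data in loader:
    optimizer.zero_grad()
    loss = model(data)                           # placeholder: forward pass returning a scalar loss
    (loss * scaler.loss_scale).backward()        # scale up before backward

    params = [p for p in model.parameters() if p.requires_grad]
    overflow = scaler.has_overflow(params)
    if not overflow:
        for p in params:
            if p.grad is not None:
                p.grad.div_(scaler.loss_scale)   # unscale before the update
        optimizer.step()
    scaler.update_scale(overflow)                # shrink on overflow, grow after scale_window clean iters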
-from .checkpoint import CheckpointHook -from .closure import ClosureHook -from .ema import EMAHook -from .evaluation import DistEvalHook, EvalHook -from .hook import HOOKS, Hook -from .iter_timer import IterTimerHook -from .logger import (DvcliveLoggerHook, LoggerHook, MlflowLoggerHook, - NeptuneLoggerHook, PaviLoggerHook, TensorboardLoggerHook, - TextLoggerHook, WandbLoggerHook) -from .lr_updater import LrUpdaterHook -from .memory import EmptyCacheHook -from .momentum_updater import MomentumUpdaterHook -from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, - GradientCumulativeOptimizerHook, OptimizerHook) -from .profiler import ProfilerHook -from .sampler_seed import DistSamplerSeedHook -from .sync_buffer import SyncBuffersHook - -__all__ = [ - 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', - 'OptimizerHook', 'Fp16OptimizerHook', 'IterTimerHook', - 'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', - 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', - 'NeptuneLoggerHook', 'WandbLoggerHook', 'DvcliveLoggerHook', - 'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'EvalHook', - 'DistEvalHook', 'ProfilerHook', 'GradientCumulativeOptimizerHook', - 'GradientCumulativeFp16OptimizerHook' -] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/checkpoint.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/checkpoint.py deleted file mode 100644 index 6af3fae43ac4b35532641a81eb13557edfc7dfba..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/checkpoint.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import warnings - -from annotator.uniformer.mmcv.fileio import FileClient -from ..dist_utils import allreduce_params, master_only -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class CheckpointHook(Hook): - """Save checkpoints periodically. - - Args: - interval (int): The saving period. If ``by_epoch=True``, interval - indicates epochs, otherwise it indicates iterations. - Default: -1, which means "never". - by_epoch (bool): Saving checkpoints by epoch or by iteration. - Default: True. - save_optimizer (bool): Whether to save optimizer state_dict in the - checkpoint. It is usually used for resuming experiments. - Default: True. - out_dir (str, optional): The root directory to save checkpoints. If not - specified, ``runner.work_dir`` will be used by default. If - specified, the ``out_dir`` will be the concatenation of ``out_dir`` - and the last level directory of ``runner.work_dir``. - `Changed in version 1.3.16.` - max_keep_ckpts (int, optional): The maximum checkpoints to keep. - In some cases we want only the latest few checkpoints and would - like to delete old ones to save the disk space. - Default: -1, which means unlimited. - save_last (bool, optional): Whether to force the last checkpoint to be - saved regardless of interval. Default: True. - sync_buffer (bool, optional): Whether to synchronize buffers in - different gpus. Default: False. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - `New in version 1.3.16.` - - .. warning:: - Before v1.3.16, the ``out_dir`` argument indicates the path where the - checkpoint is stored. 
However, since v1.3.16, ``out_dir`` indicates the - root directory and the final path to save checkpoint is the - concatenation of ``out_dir`` and the last level directory of - ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A" - and the value of ``runner.work_dir`` is "/path/of/B", then the final - path will be "/path/of/A/B". - """ - - def __init__(self, - interval=-1, - by_epoch=True, - save_optimizer=True, - out_dir=None, - max_keep_ckpts=-1, - save_last=True, - sync_buffer=False, - file_client_args=None, - **kwargs): - self.interval = interval - self.by_epoch = by_epoch - self.save_optimizer = save_optimizer - self.out_dir = out_dir - self.max_keep_ckpts = max_keep_ckpts - self.save_last = save_last - self.args = kwargs - self.sync_buffer = sync_buffer - self.file_client_args = file_client_args - - def before_run(self, runner): - if not self.out_dir: - self.out_dir = runner.work_dir - - self.file_client = FileClient.infer_client(self.file_client_args, - self.out_dir) - - # if `self.out_dir` is not equal to `runner.work_dir`, it means that - # `self.out_dir` is set so the final `self.out_dir` is the - # concatenation of `self.out_dir` and the last level directory of - # `runner.work_dir` - if self.out_dir != runner.work_dir: - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - self.out_dir = self.file_client.join_path(self.out_dir, basename) - - runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by ' - f'{self.file_client.name}.')) - - # disable the create_symlink option because some file backends do not - # allow to create a symlink - if 'create_symlink' in self.args: - if self.args[ - 'create_symlink'] and not self.file_client.allow_symlink: - self.args['create_symlink'] = False - warnings.warn( - ('create_symlink is set as True by the user but is changed' - 'to be False because creating symbolic link is not ' - f'allowed in {self.file_client.name}')) - else: - self.args['create_symlink'] = self.file_client.allow_symlink - - def after_train_epoch(self, runner): - if not self.by_epoch: - return - - # save checkpoint for following cases: - # 1. every ``self.interval`` epochs - # 2. 
reach the last epoch of training - if self.every_n_epochs( - runner, self.interval) or (self.save_last - and self.is_last_epoch(runner)): - runner.logger.info( - f'Saving checkpoint at {runner.epoch + 1} epochs') - if self.sync_buffer: - allreduce_params(runner.model.buffers()) - self._save_checkpoint(runner) - - @master_only - def _save_checkpoint(self, runner): - """Save the current checkpoint and delete unwanted checkpoint.""" - runner.save_checkpoint( - self.out_dir, save_optimizer=self.save_optimizer, **self.args) - if runner.meta is not None: - if self.by_epoch: - cur_ckpt_filename = self.args.get( - 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) - else: - cur_ckpt_filename = self.args.get( - 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) - runner.meta.setdefault('hook_msgs', dict()) - runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( - self.out_dir, cur_ckpt_filename) - # remove other checkpoints - if self.max_keep_ckpts > 0: - if self.by_epoch: - name = 'epoch_{}.pth' - current_ckpt = runner.epoch + 1 - else: - name = 'iter_{}.pth' - current_ckpt = runner.iter + 1 - redundant_ckpts = range( - current_ckpt - self.max_keep_ckpts * self.interval, 0, - -self.interval) - filename_tmpl = self.args.get('filename_tmpl', name) - for _step in redundant_ckpts: - ckpt_path = self.file_client.join_path( - self.out_dir, filename_tmpl.format(_step)) - if self.file_client.isfile(ckpt_path): - self.file_client.remove(ckpt_path) - else: - break - - def after_train_iter(self, runner): - if self.by_epoch: - return - - # save checkpoint for following cases: - # 1. every ``self.interval`` iterations - # 2. reach the last iteration of training - if self.every_n_iters( - runner, self.interval) or (self.save_last - and self.is_last_iter(runner)): - runner.logger.info( - f'Saving checkpoint at {runner.iter + 1} iterations') - if self.sync_buffer: - allreduce_params(runner.model.buffers()) - self._save_checkpoint(runner) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/closure.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/closure.py deleted file mode 100644 index b955f81f425be4ac3e6bb3f4aac653887989e872..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/closure.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class ClosureHook(Hook): - - def __init__(self, fn_name, fn): - assert hasattr(self, fn_name) - assert callable(fn) - setattr(self, fn_name, fn) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/ema.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/ema.py deleted file mode 100644 index 15c7e68088f019802a59e7ae41cc1fe0c7f28f96..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/ema.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ...parallel import is_module_wrapper -from ..hooks.hook import HOOKS, Hook - - -@HOOKS.register_module() -class EMAHook(Hook): - r"""Exponential Moving Average Hook. - - Use Exponential Moving Average on all parameters of model in training - process. All parameters have a ema backup, which update by the formula - as below. EMAHook takes priority over EvalHook and CheckpointSaverHook. - - .. math:: - - \text{Xema\_{t+1}} = (1 - \text{momentum}) \times - \text{Xema\_{t}} + \text{momentum} \times X_t - - Args: - momentum (float): The momentum used for updating ema parameter. 
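The pruning logic above keeps only the newest max_keep_ckpts checkpoints; a worked example of which files it targets:

# Suppose by_epoch=True, interval=2, max_keep_ckpts=3 and epoch_10.pth was just saved.
interval, max_keep_ckpts, current_ckpt = 2, 3, 10
redundant = list(range(current_ckpt - max_keep_ckpts * interval, 0, -interval))
print(redundant)   # [4, 2] -> epoch_4.pth and epoch_2.pth are removed,
                   # leaving epoch_6.pth, epoch_8.pth and epoch_10.pth on disk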
- Defaults to 0.0002. - interval (int): Update ema parameter every interval iteration. - Defaults to 1. - warm_up (int): During first warm_up steps, we may use smaller momentum - to update ema parameters more slowly. Defaults to 100. - resume_from (str): The checkpoint path. Defaults to None. - """ - - def __init__(self, - momentum=0.0002, - interval=1, - warm_up=100, - resume_from=None): - assert isinstance(interval, int) and interval > 0 - self.warm_up = warm_up - self.interval = interval - assert momentum > 0 and momentum < 1 - self.momentum = momentum**interval - self.checkpoint = resume_from - - def before_run(self, runner): - """To resume model with it's ema parameters more friendly. - - Register ema parameter as ``named_buffer`` to model - """ - model = runner.model - if is_module_wrapper(model): - model = model.module - self.param_ema_buffer = {} - self.model_parameters = dict(model.named_parameters(recurse=True)) - for name, value in self.model_parameters.items(): - # "." is not allowed in module's buffer name - buffer_name = f"ema_{name.replace('.', '_')}" - self.param_ema_buffer[name] = buffer_name - model.register_buffer(buffer_name, value.data.clone()) - self.model_buffers = dict(model.named_buffers(recurse=True)) - if self.checkpoint is not None: - runner.resume(self.checkpoint) - - def after_train_iter(self, runner): - """Update ema parameter every self.interval iterations.""" - curr_step = runner.iter - # We warm up the momentum considering the instability at beginning - momentum = min(self.momentum, - (1 + curr_step) / (self.warm_up + curr_step)) - if curr_step % self.interval != 0: - return - for name, parameter in self.model_parameters.items(): - buffer_name = self.param_ema_buffer[name] - buffer_parameter = self.model_buffers[buffer_name] - buffer_parameter.mul_(1 - momentum).add_(momentum, parameter.data) - - def after_train_epoch(self, runner): - """We load parameter values from ema backup to model before the - EvalHook.""" - self._swap_ema_parameters() - - def before_train_epoch(self, runner): - """We recover model's parameter from ema backup after last epoch's - EvalHook.""" - self._swap_ema_parameters() - - def _swap_ema_parameters(self): - """Swap the parameter of model with parameter in ema_buffer.""" - for name, value in self.model_parameters.items(): - temp = value.data.clone() - ema_buffer = self.model_buffers[self.param_ema_buffer[name]] - value.data.copy_(ema_buffer.data) - ema_buffer.data.copy_(temp) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/evaluation.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/evaluation.py deleted file mode 100644 index 4d00999ce5665c53bded8de9e084943eee2d230d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/evaluation.py +++ /dev/null @@ -1,509 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import warnings -from math import inf - -import torch.distributed as dist -from torch.nn.modules.batchnorm import _BatchNorm -from torch.utils.data import DataLoader - -from annotator.uniformer.mmcv.fileio import FileClient -from annotator.uniformer.mmcv.utils import is_seq_of -from .hook import Hook -from .logger import LoggerHook - - -class EvalHook(Hook): - """Non-Distributed evaluation hook. - - This hook will regularly perform evaluation in a given interval when - performing in non-distributed environment. - - Args: - dataloader (DataLoader): A PyTorch dataloader, whose dataset has - implemented ``evaluate`` function. 
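Numerically, the buffer update in after_train_iter above reduces to a scalar recurrence; a pure-Python illustration with made-up values:

momentum, warm_up = 0.0002, 100
ema, weight = 0.0, 1.0    # the buffer starts as a clone; pretend the live weight has since moved to 1.0
for step in range(10000):
    # the min() only lowers m while momentum > 1 / warm_up, i.e. for large momenta
    m = min(momentum, (1 + step) / (warm_up + step))
    ema = (1 - m) * ema + m * weight
print(round(ema, 3))      # ~0.865: after 10k steps the buffer still trails the live weight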
- start (int | None, optional): Evaluation starting epoch. It enables - evaluation before the training starts if ``start`` <= the resuming - epoch. If None, whether to evaluate is merely decided by - ``interval``. Default: None. - interval (int): Evaluation interval. Default: 1. - by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - Default: True. - save_best (str, optional): If a metric is specified, it would measure - the best checkpoint during evaluation. The information about best - checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep - best score value and best checkpoint path, which will be also - loaded when resume checkpoint. Options are the evaluation metrics - on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox - detection and instance segmentation. ``AR@100`` for proposal - recall. If ``save_best`` is ``auto``, the first key of the returned - ``OrderedDict`` result will be used. Default: None. - rule (str | None, optional): Comparison rule for best score. If set to - None, it will infer a reasonable rule. Keys such as 'acc', 'top' - .etc will be inferred by 'greater' rule. Keys contain 'loss' will - be inferred by 'less' rule. Options are 'greater', 'less', None. - Default: None. - test_fn (callable, optional): test a model with samples from a - dataloader, and return the test results. If ``None``, the default - test function ``mmcv.engine.single_gpu_test`` will be used. - (default: ``None``) - greater_keys (List[str] | None, optional): Metric keys that will be - inferred by 'greater' comparison rule. If ``None``, - _default_greater_keys will be used. (default: ``None``) - less_keys (List[str] | None, optional): Metric keys that will be - inferred by 'less' comparison rule. If ``None``, _default_less_keys - will be used. (default: ``None``) - out_dir (str, optional): The root directory to save checkpoints. If not - specified, `runner.work_dir` will be used by default. If specified, - the `out_dir` will be the concatenation of `out_dir` and the last - level directory of `runner.work_dir`. - `New in version 1.3.16.` - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:`mmcv.fileio.FileClient` for details. Default: None. - `New in version 1.3.16.` - **eval_kwargs: Evaluation arguments fed into the evaluate function of - the dataset. - - Notes: - If new arguments are added for EvalHook, tools/test.py, - tools/eval_metric.py may be affected. - """ - - # Since the key for determine greater or less is related to the downstream - # tasks, downstream repos may need to overwrite the following inner - # variable accordingly. 
- - rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} - init_value_map = {'greater': -inf, 'less': inf} - _default_greater_keys = [ - 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', - 'mAcc', 'aAcc' - ] - _default_less_keys = ['loss'] - - def __init__(self, - dataloader, - start=None, - interval=1, - by_epoch=True, - save_best=None, - rule=None, - test_fn=None, - greater_keys=None, - less_keys=None, - out_dir=None, - file_client_args=None, - **eval_kwargs): - if not isinstance(dataloader, DataLoader): - raise TypeError(f'dataloader must be a pytorch DataLoader, ' - f'but got {type(dataloader)}') - - if interval <= 0: - raise ValueError(f'interval must be a positive number, ' - f'but got {interval}') - - assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' - - if start is not None and start < 0: - raise ValueError(f'The evaluation start epoch {start} is smaller ' - f'than 0') - - self.dataloader = dataloader - self.interval = interval - self.start = start - self.by_epoch = by_epoch - - assert isinstance(save_best, str) or save_best is None, \ - '""save_best"" should be a str or None ' \ - f'rather than {type(save_best)}' - self.save_best = save_best - self.eval_kwargs = eval_kwargs - self.initial_flag = True - - if test_fn is None: - from annotator.uniformer.mmcv.engine import single_gpu_test - self.test_fn = single_gpu_test - else: - self.test_fn = test_fn - - if greater_keys is None: - self.greater_keys = self._default_greater_keys - else: - if not isinstance(greater_keys, (list, tuple)): - greater_keys = (greater_keys, ) - assert is_seq_of(greater_keys, str) - self.greater_keys = greater_keys - - if less_keys is None: - self.less_keys = self._default_less_keys - else: - if not isinstance(less_keys, (list, tuple)): - less_keys = (less_keys, ) - assert is_seq_of(less_keys, str) - self.less_keys = less_keys - - if self.save_best is not None: - self.best_ckpt_path = None - self._init_rule(rule, self.save_best) - - self.out_dir = out_dir - self.file_client_args = file_client_args - - def _init_rule(self, rule, key_indicator): - """Initialize rule, key_indicator, comparison_func, and best score. - - Here is the rule to determine which rule is used for key indicator - when the rule is not specific (note that the key indicator matching - is case-insensitive): - 1. If the key indicator is in ``self.greater_keys``, the rule will be - specified as 'greater'. - 2. Or if the key indicator is in ``self.less_keys``, the rule will be - specified as 'less'. - 3. Or if the key indicator is equal to the substring in any one item - in ``self.greater_keys``, the rule will be specified as 'greater'. - 4. Or if the key indicator is equal to the substring in any one item - in ``self.less_keys``, the rule will be specified as 'less'. - - Args: - rule (str | None): Comparison rule for best score. - key_indicator (str | None): Key indicator to determine the - comparison rule. 
- """ - if rule not in self.rule_map and rule is not None: - raise KeyError(f'rule must be greater, less or None, ' - f'but got {rule}.') - - if rule is None: - if key_indicator != 'auto': - # `_lc` here means we use the lower case of keys for - # case-insensitive matching - key_indicator_lc = key_indicator.lower() - greater_keys = [key.lower() for key in self.greater_keys] - less_keys = [key.lower() for key in self.less_keys] - - if key_indicator_lc in greater_keys: - rule = 'greater' - elif key_indicator_lc in less_keys: - rule = 'less' - elif any(key in key_indicator_lc for key in greater_keys): - rule = 'greater' - elif any(key in key_indicator_lc for key in less_keys): - rule = 'less' - else: - raise ValueError(f'Cannot infer the rule for key ' - f'{key_indicator}, thus a specific rule ' - f'must be specified.') - self.rule = rule - self.key_indicator = key_indicator - if self.rule is not None: - self.compare_func = self.rule_map[self.rule] - - def before_run(self, runner): - if not self.out_dir: - self.out_dir = runner.work_dir - - self.file_client = FileClient.infer_client(self.file_client_args, - self.out_dir) - - # if `self.out_dir` is not equal to `runner.work_dir`, it means that - # `self.out_dir` is set so the final `self.out_dir` is the - # concatenation of `self.out_dir` and the last level directory of - # `runner.work_dir` - if self.out_dir != runner.work_dir: - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - self.out_dir = self.file_client.join_path(self.out_dir, basename) - runner.logger.info( - (f'The best checkpoint will be saved to {self.out_dir} by ' - f'{self.file_client.name}')) - - if self.save_best is not None: - if runner.meta is None: - warnings.warn('runner.meta is None. Creating an empty one.') - runner.meta = dict() - runner.meta.setdefault('hook_msgs', dict()) - self.best_ckpt_path = runner.meta['hook_msgs'].get( - 'best_ckpt', None) - - def before_train_iter(self, runner): - """Evaluate the model only at the start of training by iteration.""" - if self.by_epoch or not self.initial_flag: - return - if self.start is not None and runner.iter >= self.start: - self.after_train_iter(runner) - self.initial_flag = False - - def before_train_epoch(self, runner): - """Evaluate the model only at the start of training by epoch.""" - if not (self.by_epoch and self.initial_flag): - return - if self.start is not None and runner.epoch >= self.start: - self.after_train_epoch(runner) - self.initial_flag = False - - def after_train_iter(self, runner): - """Called after every training iter to evaluate the results.""" - if not self.by_epoch and self._should_evaluate(runner): - # Because the priority of EvalHook is higher than LoggerHook, the - # training log and the evaluating log are mixed. Therefore, - # we need to dump the training log and clear it before evaluating - # log is generated. In addition, this problem will only appear in - # `IterBasedRunner` whose `self.by_epoch` is False, because - # `EpochBasedRunner` whose `self.by_epoch` is True calls - # `_do_evaluate` in `after_train_epoch` stage, and at this stage - # the training log has been printed, so it will not cause any - # problem. 
more details at - # https://github.com/open-mmlab/mmsegmentation/issues/694 - for hook in runner._hooks: - if isinstance(hook, LoggerHook): - hook.after_train_iter(runner) - runner.log_buffer.clear() - - self._do_evaluate(runner) - - def after_train_epoch(self, runner): - """Called after every training epoch to evaluate the results.""" - if self.by_epoch and self._should_evaluate(runner): - self._do_evaluate(runner) - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - results = self.test_fn(runner.model, self.dataloader) - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - # the key_score may be `None` so it needs to skip the action to save - # the best checkpoint - if self.save_best and key_score: - self._save_ckpt(runner, key_score) - - def _should_evaluate(self, runner): - """Judge whether to perform evaluation. - - Here is the rule to judge whether to perform evaluation: - 1. It will not perform evaluation during the epoch/iteration interval, - which is determined by ``self.interval``. - 2. It will not perform evaluation if the start time is larger than - current time. - 3. It will not perform evaluation when current time is larger than - the start time but during epoch/iteration interval. - - Returns: - bool: The flag indicating whether to perform evaluation. - """ - if self.by_epoch: - current = runner.epoch - check_time = self.every_n_epochs - else: - current = runner.iter - check_time = self.every_n_iters - - if self.start is None: - if not check_time(runner, self.interval): - # No evaluation during the interval. - return False - elif (current + 1) < self.start: - # No evaluation if start is larger than the current time. - return False - else: - # Evaluation only at epochs/iters 3, 5, 7... - # if start==3 and interval==2 - if (current + 1 - self.start) % self.interval: - return False - return True - - def _save_ckpt(self, runner, key_score): - """Save the best checkpoint. - - It will compare the score according to the compare function, write - related information (best score, best checkpoint path) and save the - best checkpoint into ``work_dir``. - """ - if self.by_epoch: - current = f'epoch_{runner.epoch + 1}' - cur_type, cur_time = 'epoch', runner.epoch + 1 - else: - current = f'iter_{runner.iter + 1}' - cur_type, cur_time = 'iter', runner.iter + 1 - - best_score = runner.meta['hook_msgs'].get( - 'best_score', self.init_value_map[self.rule]) - if self.compare_func(key_score, best_score): - best_score = key_score - runner.meta['hook_msgs']['best_score'] = best_score - - if self.best_ckpt_path and self.file_client.isfile( - self.best_ckpt_path): - self.file_client.remove(self.best_ckpt_path) - runner.logger.info( - (f'The previous best checkpoint {self.best_ckpt_path} was ' - 'removed')) - - best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' - self.best_ckpt_path = self.file_client.join_path( - self.out_dir, best_ckpt_name) - runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path - - runner.save_checkpoint( - self.out_dir, best_ckpt_name, create_symlink=False) - runner.logger.info( - f'Now best checkpoint is saved as {best_ckpt_name}.') - runner.logger.info( - f'Best {self.key_indicator} is {best_score:0.4f} ' - f'at {cur_time} {cur_type}.') - - def evaluate(self, runner, results): - """Evaluate the results. - - Args: - runner (:obj:`mmcv.Runner`): The underlined training runner. - results (list): Output results. 
- """ - eval_res = self.dataloader.dataset.evaluate( - results, logger=runner.logger, **self.eval_kwargs) - - for name, val in eval_res.items(): - runner.log_buffer.output[name] = val - runner.log_buffer.ready = True - - if self.save_best is not None: - # If the performance of model is pool, the `eval_res` may be an - # empty dict and it will raise exception when `self.save_best` is - # not None. More details at - # https://github.com/open-mmlab/mmdetection/issues/6265. - if not eval_res: - warnings.warn( - 'Since `eval_res` is an empty dict, the behavior to save ' - 'the best checkpoint will be skipped in this evaluation.') - return None - - if self.key_indicator == 'auto': - # infer from eval_results - self._init_rule(self.rule, list(eval_res.keys())[0]) - return eval_res[self.key_indicator] - - return None - - -class DistEvalHook(EvalHook): - """Distributed evaluation hook. - - This hook will regularly perform evaluation in a given interval when - performing in distributed environment. - - Args: - dataloader (DataLoader): A PyTorch dataloader, whose dataset has - implemented ``evaluate`` function. - start (int | None, optional): Evaluation starting epoch. It enables - evaluation before the training starts if ``start`` <= the resuming - epoch. If None, whether to evaluate is merely decided by - ``interval``. Default: None. - interval (int): Evaluation interval. Default: 1. - by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - default: True. - save_best (str, optional): If a metric is specified, it would measure - the best checkpoint during evaluation. The information about best - checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep - best score value and best checkpoint path, which will be also - loaded when resume checkpoint. Options are the evaluation metrics - on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox - detection and instance segmentation. ``AR@100`` for proposal - recall. If ``save_best`` is ``auto``, the first key of the returned - ``OrderedDict`` result will be used. Default: None. - rule (str | None, optional): Comparison rule for best score. If set to - None, it will infer a reasonable rule. Keys such as 'acc', 'top' - .etc will be inferred by 'greater' rule. Keys contain 'loss' will - be inferred by 'less' rule. Options are 'greater', 'less', None. - Default: None. - test_fn (callable, optional): test a model with samples from a - dataloader in a multi-gpu manner, and return the test results. If - ``None``, the default test function ``mmcv.engine.multi_gpu_test`` - will be used. (default: ``None``) - tmpdir (str | None): Temporary directory to save the results of all - processes. Default: None. - gpu_collect (bool): Whether to use gpu or cpu to collect results. - Default: False. - broadcast_bn_buffer (bool): Whether to broadcast the - buffer(running_mean and running_var) of rank 0 to other rank - before evaluation. Default: True. - out_dir (str, optional): The root directory to save checkpoints. If not - specified, `runner.work_dir` will be used by default. If specified, - the `out_dir` will be the concatenation of `out_dir` and the last - level directory of `runner.work_dir`. - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:`mmcv.fileio.FileClient` for details. Default: None. - **eval_kwargs: Evaluation arguments fed into the evaluate function of - the dataset. 
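The save_best/rule machinery above infers the comparison direction from the metric name; a condensed, standalone version of that matching (the real _init_rule also checks exact key matches first):

greater_keys = ['acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice',
                'mIoU', 'mAcc', 'aAcc']
less_keys = ['loss']

def infer_rule(key_indicator):
    key = key_indicator.lower()
    if any(k.lower() in key for k in greater_keys):
        return 'greater'      # higher score -> better checkpoint
    if any(k.lower() in key for k in less_keys):
        return 'less'         # lower score -> better checkpoint
    raise ValueError(f'cannot infer a rule for {key_indicator!r}')

print(infer_rule('bbox_mAP'))   # greater
print(infer_rule('loss_cls'))   # less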
- """ - - def __init__(self, - dataloader, - start=None, - interval=1, - by_epoch=True, - save_best=None, - rule=None, - test_fn=None, - greater_keys=None, - less_keys=None, - broadcast_bn_buffer=True, - tmpdir=None, - gpu_collect=False, - out_dir=None, - file_client_args=None, - **eval_kwargs): - - if test_fn is None: - from annotator.uniformer.mmcv.engine import multi_gpu_test - test_fn = multi_gpu_test - - super().__init__( - dataloader, - start=start, - interval=interval, - by_epoch=by_epoch, - save_best=save_best, - rule=rule, - test_fn=test_fn, - greater_keys=greater_keys, - less_keys=less_keys, - out_dir=out_dir, - file_client_args=file_client_args, - **eval_kwargs) - - self.broadcast_bn_buffer = broadcast_bn_buffer - self.tmpdir = tmpdir - self.gpu_collect = gpu_collect - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - # Synchronization of BatchNorm's buffer (running_mean - # and running_var) is not supported in the DDP of pytorch, - # which may cause the inconsistent performance of models in - # different ranks, so we broadcast BatchNorm's buffers - # of rank 0 to other ranks to avoid this. - if self.broadcast_bn_buffer: - model = runner.model - for name, module in model.named_modules(): - if isinstance(module, - _BatchNorm) and module.track_running_stats: - dist.broadcast(module.running_var, 0) - dist.broadcast(module.running_mean, 0) - - tmpdir = self.tmpdir - if tmpdir is None: - tmpdir = osp.join(runner.work_dir, '.eval_hook') - - results = self.test_fn( - runner.model, - self.dataloader, - tmpdir=tmpdir, - gpu_collect=self.gpu_collect) - if runner.rank == 0: - print('\n') - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - # the key_score may be `None` so it needs to skip the action to - # save the best checkpoint - if self.save_best and key_score: - self._save_ckpt(runner, key_score) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/hook.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/hook.py deleted file mode 100644 index b8855c107727ecf85b917c890fc8b7f6359238a4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/hook.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from annotator.uniformer.mmcv.utils import Registry, is_method_overridden - -HOOKS = Registry('hook') - - -class Hook: - stages = ('before_run', 'before_train_epoch', 'before_train_iter', - 'after_train_iter', 'after_train_epoch', 'before_val_epoch', - 'before_val_iter', 'after_val_iter', 'after_val_epoch', - 'after_run') - - def before_run(self, runner): - pass - - def after_run(self, runner): - pass - - def before_epoch(self, runner): - pass - - def after_epoch(self, runner): - pass - - def before_iter(self, runner): - pass - - def after_iter(self, runner): - pass - - def before_train_epoch(self, runner): - self.before_epoch(runner) - - def before_val_epoch(self, runner): - self.before_epoch(runner) - - def after_train_epoch(self, runner): - self.after_epoch(runner) - - def after_val_epoch(self, runner): - self.after_epoch(runner) - - def before_train_iter(self, runner): - self.before_iter(runner) - - def before_val_iter(self, runner): - self.before_iter(runner) - - def after_train_iter(self, runner): - self.after_iter(runner) - - def after_val_iter(self, runner): - self.after_iter(runner) - - def every_n_epochs(self, runner, n): - return (runner.epoch + 1) % n == 0 if n > 0 else False - - def every_n_inner_iters(self, runner, n): - return (runner.inner_iter + 1) % n == 0 if n > 0 else False - - def every_n_iters(self, runner, n): - return (runner.iter + 1) % n == 0 if n > 0 else False - - def end_of_epoch(self, runner): - return runner.inner_iter + 1 == len(runner.data_loader) - - def is_last_epoch(self, runner): - return runner.epoch + 1 == runner._max_epochs - - def is_last_iter(self, runner): - return runner.iter + 1 == runner._max_iters - - def get_triggered_stages(self): - trigger_stages = set() - for stage in Hook.stages: - if is_method_overridden(stage, Hook, self): - trigger_stages.add(stage) - - # some methods will be triggered in multi stages - # use this dict to map method to stages. - method_stages_map = { - 'before_epoch': ['before_train_epoch', 'before_val_epoch'], - 'after_epoch': ['after_train_epoch', 'after_val_epoch'], - 'before_iter': ['before_train_iter', 'before_val_iter'], - 'after_iter': ['after_train_iter', 'after_val_iter'], - } - - for method, map_stages in method_stages_map.items(): - if is_method_overridden(method, Hook, self): - trigger_stages.update(map_stages) - - return [stage for stage in Hook.stages if stage in trigger_stages] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/iter_timer.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/iter_timer.py deleted file mode 100644 index cfd5002fe85ffc6992155ac01003878064a1d9be..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/iter_timer.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
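To illustrate the registry pattern from `hook.py` above: a custom hook subclasses `Hook`, overrides only the stages it needs, and registers itself with `HOOKS` so it can be built from a config dict. This is a sketch; `CheckNaNHook` is a hypothetical example, not part of the deleted sources.

import torch

from annotator.uniformer.mmcv.runner.hooks import HOOKS, Hook


@HOOKS.register_module()
class CheckNaNHook(Hook):
    """Abort training as soon as the loss becomes non-finite."""

    def after_train_iter(self, runner):
        loss = runner.outputs['loss']  # filled in by the model's train_step
        if not torch.isfinite(loss).all():
            runner.logger.error('Loss is NaN/Inf, terminating the run.')
            raise RuntimeError('non-finite loss encountered')

All other stages fall back to the no-op methods defined on `Hook`, so the hook only does work at `after_train_iter`.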
-import time - -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class IterTimerHook(Hook): - - def before_epoch(self, runner): - self.t = time.time() - - def before_iter(self, runner): - runner.log_buffer.update({'data_time': time.time() - self.t}) - - def after_iter(self, runner): - runner.log_buffer.update({'time': time.time() - self.t}) - self.t = time.time() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py deleted file mode 100644 index a0b6b345640a895368ac8a647afef6f24333d90e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import LoggerHook -from .dvclive import DvcliveLoggerHook -from .mlflow import MlflowLoggerHook -from .neptune import NeptuneLoggerHook -from .pavi import PaviLoggerHook -from .tensorboard import TensorboardLoggerHook -from .text import TextLoggerHook -from .wandb import WandbLoggerHook - -__all__ = [ - 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', - 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', - 'NeptuneLoggerHook', 'DvcliveLoggerHook' -] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/base.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/base.py deleted file mode 100644 index f845256729458ced821762a1b8ef881e17ff9955..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/base.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numbers -from abc import ABCMeta, abstractmethod - -import numpy as np -import torch - -from ..hook import Hook - - -class LoggerHook(Hook): - """Base class for logger hooks. - - Args: - interval (int): Logging interval (every k iterations). - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. - reset_flag (bool): Whether to clear the output buffer after logging. - by_epoch (bool): Whether EpochBasedRunner is used. - """ - - __metaclass__ = ABCMeta - - def __init__(self, - interval=10, - ignore_last=True, - reset_flag=False, - by_epoch=True): - self.interval = interval - self.ignore_last = ignore_last - self.reset_flag = reset_flag - self.by_epoch = by_epoch - - @abstractmethod - def log(self, runner): - pass - - @staticmethod - def is_scalar(val, include_np=True, include_torch=True): - """Tell the input variable is a scalar or not. - - Args: - val: Input variable. - include_np (bool): Whether include 0-d np.ndarray as a scalar. - include_torch (bool): Whether include 0-d torch.Tensor as a scalar. - - Returns: - bool: True or False. 
- """ - if isinstance(val, numbers.Number): - return True - elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: - return True - elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: - return True - else: - return False - - def get_mode(self, runner): - if runner.mode == 'train': - if 'time' in runner.log_buffer.output: - mode = 'train' - else: - mode = 'val' - elif runner.mode == 'val': - mode = 'val' - else: - raise ValueError(f"runner mode should be 'train' or 'val', " - f'but got {runner.mode}') - return mode - - def get_epoch(self, runner): - if runner.mode == 'train': - epoch = runner.epoch + 1 - elif runner.mode == 'val': - # normal val mode - # runner.epoch += 1 has been done before val workflow - epoch = runner.epoch - else: - raise ValueError(f"runner mode should be 'train' or 'val', " - f'but got {runner.mode}') - return epoch - - def get_iter(self, runner, inner_iter=False): - """Get the current training iteration step.""" - if self.by_epoch and inner_iter: - current_iter = runner.inner_iter + 1 - else: - current_iter = runner.iter + 1 - return current_iter - - def get_lr_tags(self, runner): - tags = {} - lrs = runner.current_lr() - if isinstance(lrs, dict): - for name, value in lrs.items(): - tags[f'learning_rate/{name}'] = value[0] - else: - tags['learning_rate'] = lrs[0] - return tags - - def get_momentum_tags(self, runner): - tags = {} - momentums = runner.current_momentum() - if isinstance(momentums, dict): - for name, value in momentums.items(): - tags[f'momentum/{name}'] = value[0] - else: - tags['momentum'] = momentums[0] - return tags - - def get_loggable_tags(self, - runner, - allow_scalar=True, - allow_text=False, - add_mode=True, - tags_to_skip=('time', 'data_time')): - tags = {} - for var, val in runner.log_buffer.output.items(): - if var in tags_to_skip: - continue - if self.is_scalar(val) and not allow_scalar: - continue - if isinstance(val, str) and not allow_text: - continue - if add_mode: - var = f'{self.get_mode(runner)}/{var}' - tags[var] = val - tags.update(self.get_lr_tags(runner)) - tags.update(self.get_momentum_tags(runner)) - return tags - - def before_run(self, runner): - for hook in runner.hooks[::-1]: - if isinstance(hook, LoggerHook): - hook.reset_flag = True - break - - def before_epoch(self, runner): - runner.log_buffer.clear() # clear logs of last epoch - - def after_train_iter(self, runner): - if self.by_epoch and self.every_n_inner_iters(runner, self.interval): - runner.log_buffer.average(self.interval) - elif not self.by_epoch and self.every_n_iters(runner, self.interval): - runner.log_buffer.average(self.interval) - elif self.end_of_epoch(runner) and not self.ignore_last: - # not precise but more stable - runner.log_buffer.average(self.interval) - - if runner.log_buffer.ready: - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() - - def after_train_epoch(self, runner): - if runner.log_buffer.ready: - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() - - def after_val_epoch(self, runner): - runner.log_buffer.average() - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/dvclive.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/dvclive.py deleted file mode 100644 index 687cdc58c0336c92b1e4f9a410ba67ebaab2bc7a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/dvclive.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 
(c) OpenMMLab. All rights reserved. -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class DvcliveLoggerHook(LoggerHook): - """Class to log metrics with dvclive. - - It requires `dvclive`_ to be installed. - - Args: - path (str): Directory where dvclive will write TSV log files. - interval (int): Logging interval (every k iterations). - Default 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. - Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: True. - by_epoch (bool): Whether EpochBasedRunner is used. - Default: True. - - .. _dvclive: - https://dvc.org/doc/dvclive - """ - - def __init__(self, - path, - interval=10, - ignore_last=True, - reset_flag=True, - by_epoch=True): - - super(DvcliveLoggerHook, self).__init__(interval, ignore_last, - reset_flag, by_epoch) - self.path = path - self.import_dvclive() - - def import_dvclive(self): - try: - import dvclive - except ImportError: - raise ImportError( - 'Please run "pip install dvclive" to install dvclive') - self.dvclive = dvclive - - @master_only - def before_run(self, runner): - self.dvclive.init(self.path) - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner) - if tags: - for k, v in tags.items(): - self.dvclive.log(k, v, step=self.get_iter(runner)) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/mlflow.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/mlflow.py deleted file mode 100644 index f9a72592be47b534ce22573775fd5a7e8e86d72d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/mlflow.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class MlflowLoggerHook(LoggerHook): - - def __init__(self, - exp_name=None, - tags=None, - log_model=True, - interval=10, - ignore_last=True, - reset_flag=False, - by_epoch=True): - """Class to log metrics and (optionally) a trained model to MLflow. - - It requires `MLflow`_ to be installed. - - Args: - exp_name (str, optional): Name of the experiment to be used. - Default None. - If not None, set the active experiment. - If experiment does not exist, an experiment with provided name - will be created. - tags (dict of str: str, optional): Tags for the current run. - Default None. - If not None, set tags for the current run. - log_model (bool, optional): Whether to log an MLflow artifact. - Default True. - If True, log runner.model as an MLflow artifact - for the current run. - interval (int): Logging interval (every k iterations). - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. - reset_flag (bool): Whether to clear the output buffer after logging - by_epoch (bool): Whether EpochBasedRunner is used. - - .. 
_MLflow: - https://www.mlflow.org/docs/latest/index.html - """ - super(MlflowLoggerHook, self).__init__(interval, ignore_last, - reset_flag, by_epoch) - self.import_mlflow() - self.exp_name = exp_name - self.tags = tags - self.log_model = log_model - - def import_mlflow(self): - try: - import mlflow - import mlflow.pytorch as mlflow_pytorch - except ImportError: - raise ImportError( - 'Please run "pip install mlflow" to install mlflow') - self.mlflow = mlflow - self.mlflow_pytorch = mlflow_pytorch - - @master_only - def before_run(self, runner): - super(MlflowLoggerHook, self).before_run(runner) - if self.exp_name is not None: - self.mlflow.set_experiment(self.exp_name) - if self.tags is not None: - self.mlflow.set_tags(self.tags) - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner) - if tags: - self.mlflow.log_metrics(tags, step=self.get_iter(runner)) - - @master_only - def after_run(self, runner): - if self.log_model: - self.mlflow_pytorch.log_model(runner.model, 'models') diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/neptune.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/neptune.py deleted file mode 100644 index 7a38772b0c93a8608f32c6357b8616e77c139dc9..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/neptune.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class NeptuneLoggerHook(LoggerHook): - """Class to log metrics to NeptuneAI. - - It requires `neptune-client` to be installed. - - Args: - init_kwargs (dict): a dict contains the initialization keys as below: - - project (str): Name of a project in a form of - namespace/project_name. If None, the value of - NEPTUNE_PROJECT environment variable will be taken. - - api_token (str): User’s API token. - If None, the value of NEPTUNE_API_TOKEN environment - variable will be taken. Note: It is strongly recommended - to use NEPTUNE_API_TOKEN environment variable rather than - placing your API token in plain text in your source code. - - name (str, optional, default is 'Untitled'): Editable name of - the run. Name is displayed in the run's Details and in - Runs table as a column. - Check https://docs.neptune.ai/api-reference/neptune#init for - more init arguments. - interval (int): Logging interval (every k iterations). - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. - reset_flag (bool): Whether to clear the output buffer after logging - by_epoch (bool): Whether EpochBasedRunner is used. - - .. 
_NeptuneAI: - https://docs.neptune.ai/you-should-know/logging-metadata - """ - - def __init__(self, - init_kwargs=None, - interval=10, - ignore_last=True, - reset_flag=True, - with_step=True, - by_epoch=True): - - super(NeptuneLoggerHook, self).__init__(interval, ignore_last, - reset_flag, by_epoch) - self.import_neptune() - self.init_kwargs = init_kwargs - self.with_step = with_step - - def import_neptune(self): - try: - import neptune.new as neptune - except ImportError: - raise ImportError( - 'Please run "pip install neptune-client" to install neptune') - self.neptune = neptune - self.run = None - - @master_only - def before_run(self, runner): - if self.init_kwargs: - self.run = self.neptune.init(**self.init_kwargs) - else: - self.run = self.neptune.init() - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner) - if tags: - for tag_name, tag_value in tags.items(): - if self.with_step: - self.run[tag_name].log( - tag_value, step=self.get_iter(runner)) - else: - tags['global_step'] = self.get_iter(runner) - self.run[tag_name].log(tags) - - @master_only - def after_run(self, runner): - self.run.stop() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/pavi.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/pavi.py deleted file mode 100644 index 1dcf146d8163aff1363e9764999b0a74d674a595..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/pavi.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os -import os.path as osp - -import torch -import yaml - -import annotator.uniformer.mmcv as mmcv -from ....parallel.utils import is_module_wrapper -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class PaviLoggerHook(LoggerHook): - - def __init__(self, - init_kwargs=None, - add_graph=False, - add_last_ckpt=False, - interval=10, - ignore_last=True, - reset_flag=False, - by_epoch=True, - img_key='img_info'): - super(PaviLoggerHook, self).__init__(interval, ignore_last, reset_flag, - by_epoch) - self.init_kwargs = init_kwargs - self.add_graph = add_graph - self.add_last_ckpt = add_last_ckpt - self.img_key = img_key - - @master_only - def before_run(self, runner): - super(PaviLoggerHook, self).before_run(runner) - try: - from pavi import SummaryWriter - except ImportError: - raise ImportError('Please run "pip install pavi" to install pavi.') - - self.run_name = runner.work_dir.split('/')[-1] - - if not self.init_kwargs: - self.init_kwargs = dict() - self.init_kwargs['name'] = self.run_name - self.init_kwargs['model'] = runner._model_name - if runner.meta is not None: - if 'config_dict' in runner.meta: - config_dict = runner.meta['config_dict'] - assert isinstance( - config_dict, - dict), ('meta["config_dict"] has to be of a dict, ' - f'but got {type(config_dict)}') - elif 'config_file' in runner.meta: - config_file = runner.meta['config_file'] - config_dict = dict(mmcv.Config.fromfile(config_file)) - else: - config_dict = None - if config_dict is not None: - # 'max_.*iter' is parsed in pavi sdk as the maximum iterations - # to properly set up the progress bar. 
- config_dict = config_dict.copy() - config_dict.setdefault('max_iter', runner.max_iters) - # non-serializable values are first converted in - # mmcv.dump to json - config_dict = json.loads( - mmcv.dump(config_dict, file_format='json')) - session_text = yaml.dump(config_dict) - self.init_kwargs['session_text'] = session_text - self.writer = SummaryWriter(**self.init_kwargs) - - def get_step(self, runner): - """Get the total training step/epoch.""" - if self.get_mode(runner) == 'val' and self.by_epoch: - return self.get_epoch(runner) - else: - return self.get_iter(runner) - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner, add_mode=False) - if tags: - self.writer.add_scalars( - self.get_mode(runner), tags, self.get_step(runner)) - - @master_only - def after_run(self, runner): - if self.add_last_ckpt: - ckpt_path = osp.join(runner.work_dir, 'latest.pth') - if osp.islink(ckpt_path): - ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path)) - - if osp.isfile(ckpt_path): - # runner.epoch += 1 has been done before `after_run`. - iteration = runner.epoch if self.by_epoch else runner.iter - return self.writer.add_snapshot_file( - tag=self.run_name, - snapshot_file_path=ckpt_path, - iteration=iteration) - - # flush the buffer and send a task ending signal to Pavi - self.writer.close() - - @master_only - def before_epoch(self, runner): - if runner.epoch == 0 and self.add_graph: - if is_module_wrapper(runner.model): - _model = runner.model.module - else: - _model = runner.model - device = next(_model.parameters()).device - data = next(iter(runner.data_loader)) - image = data[self.img_key][0:1].to(device) - with torch.no_grad(): - self.writer.add_graph(_model, image) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/tensorboard.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/tensorboard.py deleted file mode 100644 index 4dd5011dc08def6c09eef86d3ce5b124c9fc5372..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/tensorboard.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
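For context, the logger hooks in this package are rarely constructed by hand; downstream projects list them under `log_config` and the runner builds them from the `HOOKS` registry. A typical fragment might look like the following (illustrative; the exact keys depend on the consuming project's config system).

# Example config fragment (mmseg/mmdet style) enabling two of the logger hooks above.
log_config = dict(
    interval=50,  # LoggerHook.interval: aggregate and emit logs every 50 iterations
    hooks=[
        dict(type='TextLoggerHook', by_epoch=True),
        dict(type='TensorboardLoggerHook', log_dir=None),
        # dict(type='WandbLoggerHook', init_kwargs=dict(project='my-project')),
    ])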
-import os.path as osp - -from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class TensorboardLoggerHook(LoggerHook): - - def __init__(self, - log_dir=None, - interval=10, - ignore_last=True, - reset_flag=False, - by_epoch=True): - super(TensorboardLoggerHook, self).__init__(interval, ignore_last, - reset_flag, by_epoch) - self.log_dir = log_dir - - @master_only - def before_run(self, runner): - super(TensorboardLoggerHook, self).before_run(runner) - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.1')): - try: - from tensorboardX import SummaryWriter - except ImportError: - raise ImportError('Please install tensorboardX to use ' - 'TensorboardLoggerHook.') - else: - try: - from torch.utils.tensorboard import SummaryWriter - except ImportError: - raise ImportError( - 'Please run "pip install future tensorboard" to install ' - 'the dependencies to use torch.utils.tensorboard ' - '(applicable to PyTorch 1.1 or higher)') - - if self.log_dir is None: - self.log_dir = osp.join(runner.work_dir, 'tf_logs') - self.writer = SummaryWriter(self.log_dir) - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner, allow_text=True) - for tag, val in tags.items(): - if isinstance(val, str): - self.writer.add_text(tag, val, self.get_iter(runner)) - else: - self.writer.add_scalar(tag, val, self.get_iter(runner)) - - @master_only - def after_run(self, runner): - self.writer.close() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/text.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/text.py deleted file mode 100644 index 87b1a3eca9595a130121526f8b4c29915387ab35..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/text.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import datetime -import os -import os.path as osp -from collections import OrderedDict - -import torch -import torch.distributed as dist - -import annotator.uniformer.mmcv as mmcv -from annotator.uniformer.mmcv.fileio.file_client import FileClient -from annotator.uniformer.mmcv.utils import is_tuple_of, scandir -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class TextLoggerHook(LoggerHook): - """Logger hook in text. - - In this logger hook, the information will be printed on terminal and - saved in json file. - - Args: - by_epoch (bool, optional): Whether EpochBasedRunner is used. - Default: True. - interval (int, optional): Logging interval (every k iterations). - Default: 10. - ignore_last (bool, optional): Ignore the log of last iterations in each - epoch if less than :attr:`interval`. Default: True. - reset_flag (bool, optional): Whether to clear the output buffer after - logging. Default: False. - interval_exp_name (int, optional): Logging interval for experiment - name. This feature is to help users conveniently get the experiment - information from screen or log file. Default: 1000. - out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. - If ``out_dir`` is specified, logs will be copied to a new directory - which is the concatenation of ``out_dir`` and the last level - directory of ``runner.work_dir``. Default: None. - `New in version 1.3.16.` - out_suffix (str or tuple[str], optional): Those filenames ending with - ``out_suffix`` will be copied to ``out_dir``. 
- Default: ('.log.json', '.log', '.py'). - `New in version 1.3.16.` - keep_local (bool, optional): Whether to keep local log when - :attr:`out_dir` is specified. If False, the local log will be - removed. Default: True. - `New in version 1.3.16.` - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Default: None. - `New in version 1.3.16.` - """ - - def __init__(self, - by_epoch=True, - interval=10, - ignore_last=True, - reset_flag=False, - interval_exp_name=1000, - out_dir=None, - out_suffix=('.log.json', '.log', '.py'), - keep_local=True, - file_client_args=None): - super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, - by_epoch) - self.by_epoch = by_epoch - self.time_sec_tot = 0 - self.interval_exp_name = interval_exp_name - - if out_dir is None and file_client_args is not None: - raise ValueError( - 'file_client_args should be "None" when `out_dir` is not' - 'specified.') - self.out_dir = out_dir - - if not (out_dir is None or isinstance(out_dir, str) - or is_tuple_of(out_dir, str)): - raise TypeError('out_dir should be "None" or string or tuple of ' - 'string, but got {out_dir}') - self.out_suffix = out_suffix - - self.keep_local = keep_local - self.file_client_args = file_client_args - if self.out_dir is not None: - self.file_client = FileClient.infer_client(file_client_args, - self.out_dir) - - def before_run(self, runner): - super(TextLoggerHook, self).before_run(runner) - - if self.out_dir is not None: - self.file_client = FileClient.infer_client(self.file_client_args, - self.out_dir) - # The final `self.out_dir` is the concatenation of `self.out_dir` - # and the last level directory of `runner.work_dir` - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - self.out_dir = self.file_client.join_path(self.out_dir, basename) - runner.logger.info( - (f'Text logs will be saved to {self.out_dir} by ' - f'{self.file_client.name} after the training process.')) - - self.start_iter = runner.iter - self.json_log_path = osp.join(runner.work_dir, - f'{runner.timestamp}.log.json') - if runner.meta is not None: - self._dump_log(runner.meta, runner) - - def _get_max_memory(self, runner): - device = getattr(runner.model, 'output_device', None) - mem = torch.cuda.max_memory_allocated(device=device) - mem_mb = torch.tensor([mem / (1024 * 1024)], - dtype=torch.int, - device=device) - if runner.world_size > 1: - dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) - return mem_mb.item() - - def _log_info(self, log_dict, runner): - # print exp name for users to distinguish experiments - # at every ``interval_exp_name`` iterations and the end of each epoch - if runner.meta is not None and 'exp_name' in runner.meta: - if (self.every_n_iters(runner, self.interval_exp_name)) or ( - self.by_epoch and self.end_of_epoch(runner)): - exp_info = f'Exp name: {runner.meta["exp_name"]}' - runner.logger.info(exp_info) - - if log_dict['mode'] == 'train': - if isinstance(log_dict['lr'], dict): - lr_str = [] - for k, val in log_dict['lr'].items(): - lr_str.append(f'lr_{k}: {val:.3e}') - lr_str = ' '.join(lr_str) - else: - lr_str = f'lr: {log_dict["lr"]:.3e}' - - # by epoch: Epoch [4][100/1000] - # by iter: Iter [100/100000] - if self.by_epoch: - log_str = f'Epoch [{log_dict["epoch"]}]' \ - f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t' - else: - log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t' - log_str += f'{lr_str}, ' - - if 'time' in log_dict.keys(): - self.time_sec_tot += (log_dict['time'] * self.interval) 
- time_sec_avg = self.time_sec_tot / ( - runner.iter - self.start_iter + 1) - eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) - eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - log_str += f'eta: {eta_str}, ' - log_str += f'time: {log_dict["time"]:.3f}, ' \ - f'data_time: {log_dict["data_time"]:.3f}, ' - # statistic memory - if torch.cuda.is_available(): - log_str += f'memory: {log_dict["memory"]}, ' - else: - # val/test time - # here 1000 is the length of the val dataloader - # by epoch: Epoch[val] [4][1000] - # by iter: Iter[val] [1000] - if self.by_epoch: - log_str = f'Epoch({log_dict["mode"]}) ' \ - f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' - else: - log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' - - log_items = [] - for name, val in log_dict.items(): - # TODO: resolve this hack - # these items have been in log_str - if name in [ - 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', - 'memory', 'epoch' - ]: - continue - if isinstance(val, float): - val = f'{val:.4f}' - log_items.append(f'{name}: {val}') - log_str += ', '.join(log_items) - - runner.logger.info(log_str) - - def _dump_log(self, log_dict, runner): - # dump log in json format - json_log = OrderedDict() - for k, v in log_dict.items(): - json_log[k] = self._round_float(v) - # only append log at last line - if runner.rank == 0: - with open(self.json_log_path, 'a+') as f: - mmcv.dump(json_log, f, file_format='json') - f.write('\n') - - def _round_float(self, items): - if isinstance(items, list): - return [self._round_float(item) for item in items] - elif isinstance(items, float): - return round(items, 5) - else: - return items - - def log(self, runner): - if 'eval_iter_num' in runner.log_buffer.output: - # this doesn't modify runner.iter and is regardless of by_epoch - cur_iter = runner.log_buffer.output.pop('eval_iter_num') - else: - cur_iter = self.get_iter(runner, inner_iter=True) - - log_dict = OrderedDict( - mode=self.get_mode(runner), - epoch=self.get_epoch(runner), - iter=cur_iter) - - # only record lr of the first param group - cur_lr = runner.current_lr() - if isinstance(cur_lr, list): - log_dict['lr'] = cur_lr[0] - else: - assert isinstance(cur_lr, dict) - log_dict['lr'] = {} - for k, lr_ in cur_lr.items(): - assert isinstance(lr_, list) - log_dict['lr'].update({k: lr_[0]}) - - if 'time' in runner.log_buffer.output: - # statistic memory - if torch.cuda.is_available(): - log_dict['memory'] = self._get_max_memory(runner) - - log_dict = dict(log_dict, **runner.log_buffer.output) - - self._log_info(log_dict, runner) - self._dump_log(log_dict, runner) - return log_dict - - def after_run(self, runner): - # copy or upload logs to self.out_dir - if self.out_dir is not None: - for filename in scandir(runner.work_dir, self.out_suffix, True): - local_filepath = osp.join(runner.work_dir, filename) - out_filepath = self.file_client.join_path( - self.out_dir, filename) - with open(local_filepath, 'r') as f: - self.file_client.put_text(f.read(), out_filepath) - - runner.logger.info( - (f'The file {local_filepath} has been uploaded to ' - f'{out_filepath}.')) - - if not self.keep_local: - os.remove(local_filepath) - runner.logger.info( - (f'{local_filepath} was removed due to the ' - '`self.keep_local=False`')) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/wandb.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/wandb.py deleted file mode 100644 index 9f6808462eb79ab2b04806a5d9f0d3dd079b5ea9..0000000000000000000000000000000000000000 --- 
a/ControlNet/annotator/uniformer/mmcv/runner/hooks/logger/wandb.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class WandbLoggerHook(LoggerHook): - - def __init__(self, - init_kwargs=None, - interval=10, - ignore_last=True, - reset_flag=False, - commit=True, - by_epoch=True, - with_step=True): - super(WandbLoggerHook, self).__init__(interval, ignore_last, - reset_flag, by_epoch) - self.import_wandb() - self.init_kwargs = init_kwargs - self.commit = commit - self.with_step = with_step - - def import_wandb(self): - try: - import wandb - except ImportError: - raise ImportError( - 'Please run "pip install wandb" to install wandb') - self.wandb = wandb - - @master_only - def before_run(self, runner): - super(WandbLoggerHook, self).before_run(runner) - if self.wandb is None: - self.import_wandb() - if self.init_kwargs: - self.wandb.init(**self.init_kwargs) - else: - self.wandb.init() - - @master_only - def log(self, runner): - tags = self.get_loggable_tags(runner) - if tags: - if self.with_step: - self.wandb.log( - tags, step=self.get_iter(runner), commit=self.commit) - else: - tags['global_step'] = self.get_iter(runner) - self.wandb.log(tags, commit=self.commit) - - @master_only - def after_run(self, runner): - self.wandb.join() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/lr_updater.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/lr_updater.py deleted file mode 100644 index 6365908ddf6070086de2ffc0afada46ed2f32256..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/lr_updater.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numbers -from math import cos, pi - -import annotator.uniformer.mmcv as mmcv -from .hook import HOOKS, Hook - - -class LrUpdaterHook(Hook): - """LR Scheduler in MMCV. - - Args: - by_epoch (bool): LR changes epoch by epoch - warmup (string): Type of warmup used. 
It can be None(use no warmup), - 'constant', 'linear' or 'exp' - warmup_iters (int): The number of iterations or epochs that warmup - lasts - warmup_ratio (float): LR used at the beginning of warmup equals to - warmup_ratio * initial_lr - warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters - means the number of epochs that warmup lasts, otherwise means the - number of iteration that warmup lasts - """ - - def __init__(self, - by_epoch=True, - warmup=None, - warmup_iters=0, - warmup_ratio=0.1, - warmup_by_epoch=False): - # validate the "warmup" argument - if warmup is not None: - if warmup not in ['constant', 'linear', 'exp']: - raise ValueError( - f'"{warmup}" is not a supported type for warming up, valid' - ' types are "constant" and "linear"') - if warmup is not None: - assert warmup_iters > 0, \ - '"warmup_iters" must be a positive integer' - assert 0 < warmup_ratio <= 1.0, \ - '"warmup_ratio" must be in range (0,1]' - - self.by_epoch = by_epoch - self.warmup = warmup - self.warmup_iters = warmup_iters - self.warmup_ratio = warmup_ratio - self.warmup_by_epoch = warmup_by_epoch - - if self.warmup_by_epoch: - self.warmup_epochs = self.warmup_iters - self.warmup_iters = None - else: - self.warmup_epochs = None - - self.base_lr = [] # initial lr for all param groups - self.regular_lr = [] # expected lr if no warming up is performed - - def _set_lr(self, runner, lr_groups): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - for param_group, lr in zip(optim.param_groups, lr_groups[k]): - param_group['lr'] = lr - else: - for param_group, lr in zip(runner.optimizer.param_groups, - lr_groups): - param_group['lr'] = lr - - def get_lr(self, runner, base_lr): - raise NotImplementedError - - def get_regular_lr(self, runner): - if isinstance(runner.optimizer, dict): - lr_groups = {} - for k in runner.optimizer.keys(): - _lr_group = [ - self.get_lr(runner, _base_lr) - for _base_lr in self.base_lr[k] - ] - lr_groups.update({k: _lr_group}) - - return lr_groups - else: - return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr] - - def get_warmup_lr(self, cur_iters): - - def _get_warmup_lr(cur_iters, regular_lr): - if self.warmup == 'constant': - warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr] - elif self.warmup == 'linear': - k = (1 - cur_iters / self.warmup_iters) * (1 - - self.warmup_ratio) - warmup_lr = [_lr * (1 - k) for _lr in regular_lr] - elif self.warmup == 'exp': - k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) - warmup_lr = [_lr * k for _lr in regular_lr] - return warmup_lr - - if isinstance(self.regular_lr, dict): - lr_groups = {} - for key, regular_lr in self.regular_lr.items(): - lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr) - return lr_groups - else: - return _get_warmup_lr(cur_iters, self.regular_lr) - - def before_run(self, runner): - # NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved, - # it will be set according to the optimizer params - if isinstance(runner.optimizer, dict): - self.base_lr = {} - for k, optim in runner.optimizer.items(): - for group in optim.param_groups: - group.setdefault('initial_lr', group['lr']) - _base_lr = [ - group['initial_lr'] for group in optim.param_groups - ] - self.base_lr.update({k: _base_lr}) - else: - for group in runner.optimizer.param_groups: - group.setdefault('initial_lr', group['lr']) - self.base_lr = [ - group['initial_lr'] for group in runner.optimizer.param_groups - ] - - def before_train_epoch(self, runner): - if self.warmup_iters 
is None: - epoch_len = len(runner.data_loader) - self.warmup_iters = self.warmup_epochs * epoch_len - - if not self.by_epoch: - return - - self.regular_lr = self.get_regular_lr(runner) - self._set_lr(runner, self.regular_lr) - - def before_train_iter(self, runner): - cur_iter = runner.iter - if not self.by_epoch: - self.regular_lr = self.get_regular_lr(runner) - if self.warmup is None or cur_iter >= self.warmup_iters: - self._set_lr(runner, self.regular_lr) - else: - warmup_lr = self.get_warmup_lr(cur_iter) - self._set_lr(runner, warmup_lr) - elif self.by_epoch: - if self.warmup is None or cur_iter > self.warmup_iters: - return - elif cur_iter == self.warmup_iters: - self._set_lr(runner, self.regular_lr) - else: - warmup_lr = self.get_warmup_lr(cur_iter) - self._set_lr(runner, warmup_lr) - - -@HOOKS.register_module() -class FixedLrUpdaterHook(LrUpdaterHook): - - def __init__(self, **kwargs): - super(FixedLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - return base_lr - - -@HOOKS.register_module() -class StepLrUpdaterHook(LrUpdaterHook): - """Step LR scheduler with min_lr clipping. - - Args: - step (int | list[int]): Step to decay the LR. If an int value is given, - regard it as the decay interval. If a list is given, decay LR at - these steps. - gamma (float, optional): Decay LR ratio. Default: 0.1. - min_lr (float, optional): Minimum LR value to keep. If LR after decay - is lower than `min_lr`, it will be clipped to this value. If None - is given, we don't perform lr clipping. Default: None. - """ - - def __init__(self, step, gamma=0.1, min_lr=None, **kwargs): - if isinstance(step, list): - assert mmcv.is_list_of(step, int) - assert all([s > 0 for s in step]) - elif isinstance(step, int): - assert step > 0 - else: - raise TypeError('"step" must be a list or integer') - self.step = step - self.gamma = gamma - self.min_lr = min_lr - super(StepLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - progress = runner.epoch if self.by_epoch else runner.iter - - # calculate exponential term - if isinstance(self.step, int): - exp = progress // self.step - else: - exp = len(self.step) - for i, s in enumerate(self.step): - if progress < s: - exp = i - break - - lr = base_lr * (self.gamma**exp) - if self.min_lr is not None: - # clip to a minimum value - lr = max(lr, self.min_lr) - return lr - - -@HOOKS.register_module() -class ExpLrUpdaterHook(LrUpdaterHook): - - def __init__(self, gamma, **kwargs): - self.gamma = gamma - super(ExpLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - progress = runner.epoch if self.by_epoch else runner.iter - return base_lr * self.gamma**progress - - -@HOOKS.register_module() -class PolyLrUpdaterHook(LrUpdaterHook): - - def __init__(self, power=1., min_lr=0., **kwargs): - self.power = power - self.min_lr = min_lr - super(PolyLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - coeff = (1 - progress / max_progress)**self.power - return (base_lr - self.min_lr) * coeff + self.min_lr - - -@HOOKS.register_module() -class InvLrUpdaterHook(LrUpdaterHook): - - def __init__(self, gamma, power=1., **kwargs): - self.gamma = gamma - self.power = power - super(InvLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - progress = runner.epoch if self.by_epoch else runner.iter - return base_lr * (1 + 
self.gamma * progress)**(-self.power) - - -@HOOKS.register_module() -class CosineAnnealingLrUpdaterHook(LrUpdaterHook): - - def __init__(self, min_lr=None, min_lr_ratio=None, **kwargs): - assert (min_lr is None) ^ (min_lr_ratio is None) - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - super(CosineAnnealingLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr - return annealing_cos(base_lr, target_lr, progress / max_progress) - - -@HOOKS.register_module() -class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook): - """Flat + Cosine lr schedule. - - Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501 - - Args: - start_percent (float): When to start annealing the learning rate - after the percentage of the total training steps. - The value should be in range [0, 1). - Default: 0.75 - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - start_percent=0.75, - min_lr=None, - min_lr_ratio=None, - **kwargs): - assert (min_lr is None) ^ (min_lr_ratio is None) - if start_percent < 0 or start_percent > 1 or not isinstance( - start_percent, float): - raise ValueError( - 'expected float between 0 and 1 start_percent, but ' - f'got {start_percent}') - self.start_percent = start_percent - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - super(FlatCosineAnnealingLrUpdaterHook, self).__init__(**kwargs) - - def get_lr(self, runner, base_lr): - if self.by_epoch: - start = round(runner.max_epochs * self.start_percent) - progress = runner.epoch - start - max_progress = runner.max_epochs - start - else: - start = round(runner.max_iters * self.start_percent) - progress = runner.iter - start - max_progress = runner.max_iters - start - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr - - if progress < 0: - return base_lr - else: - return annealing_cos(base_lr, target_lr, progress / max_progress) - - -@HOOKS.register_module() -class CosineRestartLrUpdaterHook(LrUpdaterHook): - """Cosine annealing with restarts learning rate scheme. - - Args: - periods (list[int]): Periods for each cosine anneling cycle. - restart_weights (list[float], optional): Restart weights at each - restart iteration. Default: [1]. - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - periods, - restart_weights=[1], - min_lr=None, - min_lr_ratio=None, - **kwargs): - assert (min_lr is None) ^ (min_lr_ratio is None) - self.periods = periods - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - self.restart_weights = restart_weights - assert (len(self.periods) == len(self.restart_weights) - ), 'periods and restart_weights should have the same length.' 
- super(CosineRestartLrUpdaterHook, self).__init__(**kwargs) - - self.cumulative_periods = [ - sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) - ] - - def get_lr(self, runner, base_lr): - if self.by_epoch: - progress = runner.epoch - else: - progress = runner.iter - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr - - idx = get_position_from_periods(progress, self.cumulative_periods) - current_weight = self.restart_weights[idx] - nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] - current_periods = self.periods[idx] - - alpha = min((progress - nearest_restart) / current_periods, 1) - return annealing_cos(base_lr, target_lr, alpha, current_weight) - - -def get_position_from_periods(iteration, cumulative_periods): - """Get the position from a period list. - - It will return the index of the right-closest number in the period list. - For example, the cumulative_periods = [100, 200, 300, 400], - if iteration == 50, return 0; - if iteration == 210, return 2; - if iteration == 300, return 3. - - Args: - iteration (int): Current iteration. - cumulative_periods (list[int]): Cumulative period list. - - Returns: - int: The position of the right-closest number in the period list. - """ - for i, period in enumerate(cumulative_periods): - if iteration < period: - return i - raise ValueError(f'Current iteration {iteration} exceeds ' - f'cumulative_periods {cumulative_periods}') - - -@HOOKS.register_module() -class CyclicLrUpdaterHook(LrUpdaterHook): - """Cyclic LR Scheduler. - - Implement the cyclical learning rate policy (CLR) described in - https://arxiv.org/pdf/1506.01186.pdf - - Different from the original paper, we use cosine annealing rather than - triangular policy inside a cycle. This improves the performance in the - 3D detection area. - - Args: - by_epoch (bool): Whether to update LR by epoch. - target_ratio (tuple[float]): Relative ratio of the highest LR and the - lowest LR to the initial LR. - cyclic_times (int): Number of cycles during training - step_ratio_up (float): The ratio of the increasing process of LR in - the total cycle. - anneal_strategy (str): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. 
- """ - - def __init__(self, - by_epoch=False, - target_ratio=(10, 1e-4), - cyclic_times=1, - step_ratio_up=0.4, - anneal_strategy='cos', - **kwargs): - if isinstance(target_ratio, float): - target_ratio = (target_ratio, target_ratio / 1e5) - elif isinstance(target_ratio, tuple): - target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ - if len(target_ratio) == 1 else target_ratio - else: - raise ValueError('target_ratio should be either float ' - f'or tuple, got {type(target_ratio)}') - - assert len(target_ratio) == 2, \ - '"target_ratio" must be list or tuple of two floats' - assert 0 <= step_ratio_up < 1.0, \ - '"step_ratio_up" must be in range [0,1)' - - self.target_ratio = target_ratio - self.cyclic_times = cyclic_times - self.step_ratio_up = step_ratio_up - self.lr_phases = [] # init lr_phases - # validate anneal_strategy - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - - assert not by_epoch, \ - 'currently only support "by_epoch" = False' - super(CyclicLrUpdaterHook, self).__init__(by_epoch, **kwargs) - - def before_run(self, runner): - super(CyclicLrUpdaterHook, self).before_run(runner) - # initiate lr_phases - # total lr_phases are separated as up and down - max_iter_per_phase = runner.max_iters // self.cyclic_times - iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) - self.lr_phases.append( - [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) - self.lr_phases.append([ - iter_up_phase, max_iter_per_phase, max_iter_per_phase, - self.target_ratio[0], self.target_ratio[1] - ]) - - def get_lr(self, runner, base_lr): - curr_iter = runner.iter - for (start_iter, end_iter, max_iter_per_phase, start_ratio, - end_ratio) in self.lr_phases: - curr_iter %= max_iter_per_phase - if start_iter <= curr_iter < end_iter: - progress = curr_iter - start_iter - return self.anneal_func(base_lr * start_ratio, - base_lr * end_ratio, - progress / (end_iter - start_iter)) - - -@HOOKS.register_module() -class OneCycleLrUpdaterHook(LrUpdaterHook): - """One Cycle LR Scheduler. - - The 1cycle learning rate policy changes the learning rate after every - batch. The one cycle learning rate policy is described in - https://arxiv.org/pdf/1708.07120.pdf - - Args: - max_lr (float or list): Upper learning rate boundaries in the cycle - for each parameter group. - total_steps (int, optional): The total number of steps in the cycle. - Note that if a value is not provided here, it will be the max_iter - of runner. Default: None. - pct_start (float): The percentage of the cycle (in number of steps) - spent increasing the learning rate. - Default: 0.3 - anneal_strategy (str): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. - Default: 'cos' - div_factor (float): Determines the initial learning rate via - initial_lr = max_lr/div_factor - Default: 25 - final_div_factor (float): Determines the minimum learning rate via - min_lr = initial_lr/final_div_factor - Default: 1e4 - three_phase (bool): If three_phase is True, use a third phase of the - schedule to annihilate the learning rate according to - final_div_factor instead of modifying the second phase (the first - two phases will be symmetrical about the step indicated by - pct_start). 
- Default: False - """ - - def __init__(self, - max_lr, - total_steps=None, - pct_start=0.3, - anneal_strategy='cos', - div_factor=25, - final_div_factor=1e4, - three_phase=False, - **kwargs): - # validate by_epoch, currently only support by_epoch = False - if 'by_epoch' not in kwargs: - kwargs['by_epoch'] = False - else: - assert not kwargs['by_epoch'], \ - 'currently only support "by_epoch" = False' - if not isinstance(max_lr, (numbers.Number, list, dict)): - raise ValueError('the type of max_lr must be the one of list or ' - f'dict, but got {type(max_lr)}') - self._max_lr = max_lr - if total_steps is not None: - if not isinstance(total_steps, int): - raise ValueError('the type of total_steps must be int, but' - f'got {type(total_steps)}') - self.total_steps = total_steps - # validate pct_start - if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): - raise ValueError('expected float between 0 and 1 pct_start, but ' - f'got {pct_start}') - self.pct_start = pct_start - # validate anneal_strategy - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - self.div_factor = div_factor - self.final_div_factor = final_div_factor - self.three_phase = three_phase - self.lr_phases = [] # init lr_phases - super(OneCycleLrUpdaterHook, self).__init__(**kwargs) - - def before_run(self, runner): - if hasattr(self, 'total_steps'): - total_steps = self.total_steps - else: - total_steps = runner.max_iters - if total_steps < runner.max_iters: - raise ValueError( - 'The total steps must be greater than or equal to max ' - f'iterations {runner.max_iters} of runner, but total steps ' - f'is {total_steps}.') - - if isinstance(runner.optimizer, dict): - self.base_lr = {} - for k, optim in runner.optimizer.items(): - _max_lr = format_param(k, optim, self._max_lr) - self.base_lr[k] = [lr / self.div_factor for lr in _max_lr] - for group, lr in zip(optim.param_groups, self.base_lr[k]): - group.setdefault('initial_lr', lr) - else: - k = type(runner.optimizer).__name__ - _max_lr = format_param(k, runner.optimizer, self._max_lr) - self.base_lr = [lr / self.div_factor for lr in _max_lr] - for group, lr in zip(runner.optimizer.param_groups, self.base_lr): - group.setdefault('initial_lr', lr) - - if self.three_phase: - self.lr_phases.append( - [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) - self.lr_phases.append([ - float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1 - ]) - self.lr_phases.append( - [total_steps - 1, 1, 1 / self.final_div_factor]) - else: - self.lr_phases.append( - [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) - self.lr_phases.append( - [total_steps - 1, self.div_factor, 1 / self.final_div_factor]) - - def get_lr(self, runner, base_lr): - curr_iter = runner.iter - start_iter = 0 - for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases): - if curr_iter <= end_iter: - pct = (curr_iter - start_iter) / (end_iter - start_iter) - lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr, - pct) - break - start_iter = end_iter - return lr - - -def annealing_cos(start, end, factor, weight=1): - """Calculate annealing cos learning rate. - - Cosine anneal from `weight * start + (1 - weight) * end` to `end` as - percentage goes from 0.0 to 1.0. 
- - Args: - start (float): The starting learning rate of the cosine annealing. - end (float): The ending learing rate of the cosine annealing. - factor (float): The coefficient of `pi` when calculating the current - percentage. Range from 0.0 to 1.0. - weight (float, optional): The combination factor of `start` and `end` - when calculating the actual starting learning rate. Default to 1. - """ - cos_out = cos(pi * factor) + 1 - return end + 0.5 * weight * (start - end) * cos_out - - -def annealing_linear(start, end, factor): - """Calculate annealing linear learning rate. - - Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0. - - Args: - start (float): The starting learning rate of the linear annealing. - end (float): The ending learing rate of the linear annealing. - factor (float): The coefficient of `pi` when calculating the current - percentage. Range from 0.0 to 1.0. - """ - return start + (end - start) * factor - - -def format_param(name, optim, param): - if isinstance(param, numbers.Number): - return [param] * len(optim.param_groups) - elif isinstance(param, (list, tuple)): # multi param groups - if len(param) != len(optim.param_groups): - raise ValueError(f'expected {len(optim.param_groups)} ' - f'values for {name}, got {len(param)}') - return param - else: # multi optimizers - if name not in param: - raise KeyError(f'{name} is not found in {param.keys()}') - return param[name] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/memory.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/memory.py deleted file mode 100644 index 70cf9a838fb314e3bd3c07aadbc00921a81e83ed..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/memory.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class EmptyCacheHook(Hook): - - def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): - self._before_epoch = before_epoch - self._after_epoch = after_epoch - self._after_iter = after_iter - - def after_iter(self, runner): - if self._after_iter: - torch.cuda.empty_cache() - - def before_epoch(self, runner): - if self._before_epoch: - torch.cuda.empty_cache() - - def after_epoch(self, runner): - if self._after_epoch: - torch.cuda.empty_cache() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/momentum_updater.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/momentum_updater.py deleted file mode 100644 index 60437756ceedf06055ec349df69a25465738d3f0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/momentum_updater.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
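As a quick sanity check of the annealing helpers defined in `lr_updater.py` above (the formula is copied into the snippet so it runs standalone; the learning-rate values are illustrative):

from math import cos, pi


def annealing_cos(start, end, factor, weight=1):
    # Same formula as lr_updater.annealing_cos above, reproduced so the check runs standalone.
    cos_out = cos(pi * factor) + 1
    return end + 0.5 * weight * (start - end) * cos_out


# Starts at `start`, passes the midpoint at factor=0.5 and lands on `end` at factor=1.0.
assert abs(annealing_cos(0.1, 0.001, 0.0) - 0.1) < 1e-12
assert abs(annealing_cos(0.1, 0.001, 0.5) - 0.0505) < 1e-12
assert abs(annealing_cos(0.1, 0.001, 1.0) - 0.001) < 1e-12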
-import annotator.uniformer.mmcv as mmcv
-from .hook import HOOKS, Hook
-from .lr_updater import annealing_cos, annealing_linear, format_param
-
-
-class MomentumUpdaterHook(Hook):
-
- def __init__(self,
- by_epoch=True,
- warmup=None,
- warmup_iters=0,
- warmup_ratio=0.9):
- # validate the "warmup" argument
- if warmup is not None:
- if warmup not in ['constant', 'linear', 'exp']:
- raise ValueError(
- f'"{warmup}" is not a supported type for warming up, valid'
- ' types are "constant", "linear" and "exp"')
- if warmup is not None:
- assert warmup_iters > 0, \
- '"warmup_iters" must be a positive integer'
- assert 0 < warmup_ratio <= 1.0, \
- '"warmup_ratio" must be in range (0,1]'
-
- self.by_epoch = by_epoch
- self.warmup = warmup
- self.warmup_iters = warmup_iters
- self.warmup_ratio = warmup_ratio
-
- self.base_momentum = [] # initial momentum for all param groups
- self.regular_mom = [] # expected momentum if no warming up is performed
-
- def _set_momentum(self, runner, momentum_groups):
- if isinstance(runner.optimizer, dict):
- for k, optim in runner.optimizer.items():
- for param_group, mom in zip(optim.param_groups,
- momentum_groups[k]):
- if 'momentum' in param_group.keys():
- param_group['momentum'] = mom
- elif 'betas' in param_group.keys():
- param_group['betas'] = (mom, param_group['betas'][1])
- else:
- for param_group, mom in zip(runner.optimizer.param_groups,
- momentum_groups):
- if 'momentum' in param_group.keys():
- param_group['momentum'] = mom
- elif 'betas' in param_group.keys():
- param_group['betas'] = (mom, param_group['betas'][1])
-
- def get_momentum(self, runner, base_momentum):
- raise NotImplementedError
-
- def get_regular_momentum(self, runner):
- if isinstance(runner.optimizer, dict):
- momentum_groups = {}
- for k in runner.optimizer.keys():
- _momentum_group = [
- self.get_momentum(runner, _base_momentum)
- for _base_momentum in self.base_momentum[k]
- ]
- momentum_groups.update({k: _momentum_group})
- return momentum_groups
- else:
- return [
- self.get_momentum(runner, _base_momentum)
- for _base_momentum in self.base_momentum
- ]
-
- def get_warmup_momentum(self, cur_iters):
-
- def _get_warmup_momentum(cur_iters, regular_momentum):
- if self.warmup == 'constant':
- warmup_momentum = [
- _momentum / self.warmup_ratio
- for _momentum in regular_momentum
- ]
- elif self.warmup == 'linear':
- k = (1 - cur_iters / self.warmup_iters) * (1 -
- self.warmup_ratio)
- warmup_momentum = [
- _momentum / (1 - k) for _momentum in regular_momentum
- ]
- elif self.warmup == 'exp':
- k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
- warmup_momentum = [
- _momentum / k for _momentum in regular_momentum
- ]
- return warmup_momentum
-
- if isinstance(self.regular_mom, dict):
- momentum_groups = {}
- for key, regular_momentum in self.regular_mom.items():
- momentum_groups[key] = _get_warmup_momentum(
- cur_iters, regular_momentum)
- return momentum_groups
- else:
- return _get_warmup_momentum(cur_iters, self.regular_mom)
-
- def before_run(self, runner):
- # NOTE: when resuming from a checkpoint,
- # if 'initial_momentum' is not saved,
- # it will be set according to the optimizer params
- if isinstance(runner.optimizer, dict):
- self.base_momentum = {}
- for k, optim in runner.optimizer.items():
- for group in optim.param_groups:
- if 'momentum' in group.keys():
- group.setdefault('initial_momentum', group['momentum'])
- else:
- group.setdefault('initial_momentum', group['betas'][0])
- _base_momentum = [
- group['initial_momentum']
for group in optim.param_groups - ] - self.base_momentum.update({k: _base_momentum}) - else: - for group in runner.optimizer.param_groups: - if 'momentum' in group.keys(): - group.setdefault('initial_momentum', group['momentum']) - else: - group.setdefault('initial_momentum', group['betas'][0]) - self.base_momentum = [ - group['initial_momentum'] - for group in runner.optimizer.param_groups - ] - - def before_train_epoch(self, runner): - if not self.by_epoch: - return - self.regular_mom = self.get_regular_momentum(runner) - self._set_momentum(runner, self.regular_mom) - - def before_train_iter(self, runner): - cur_iter = runner.iter - if not self.by_epoch: - self.regular_mom = self.get_regular_momentum(runner) - if self.warmup is None or cur_iter >= self.warmup_iters: - self._set_momentum(runner, self.regular_mom) - else: - warmup_momentum = self.get_warmup_momentum(cur_iter) - self._set_momentum(runner, warmup_momentum) - elif self.by_epoch: - if self.warmup is None or cur_iter > self.warmup_iters: - return - elif cur_iter == self.warmup_iters: - self._set_momentum(runner, self.regular_mom) - else: - warmup_momentum = self.get_warmup_momentum(cur_iter) - self._set_momentum(runner, warmup_momentum) - - -@HOOKS.register_module() -class StepMomentumUpdaterHook(MomentumUpdaterHook): - """Step momentum scheduler with min value clipping. - - Args: - step (int | list[int]): Step to decay the momentum. If an int value is - given, regard it as the decay interval. If a list is given, decay - momentum at these steps. - gamma (float, optional): Decay momentum ratio. Default: 0.5. - min_momentum (float, optional): Minimum momentum value to keep. If - momentum after decay is lower than this value, it will be clipped - accordingly. If None is given, we don't perform lr clipping. - Default: None. 
- """ - - def __init__(self, step, gamma=0.5, min_momentum=None, **kwargs): - if isinstance(step, list): - assert mmcv.is_list_of(step, int) - assert all([s > 0 for s in step]) - elif isinstance(step, int): - assert step > 0 - else: - raise TypeError('"step" must be a list or integer') - self.step = step - self.gamma = gamma - self.min_momentum = min_momentum - super(StepMomentumUpdaterHook, self).__init__(**kwargs) - - def get_momentum(self, runner, base_momentum): - progress = runner.epoch if self.by_epoch else runner.iter - - # calculate exponential term - if isinstance(self.step, int): - exp = progress // self.step - else: - exp = len(self.step) - for i, s in enumerate(self.step): - if progress < s: - exp = i - break - - momentum = base_momentum * (self.gamma**exp) - if self.min_momentum is not None: - # clip to a minimum value - momentum = max(momentum, self.min_momentum) - return momentum - - -@HOOKS.register_module() -class CosineAnnealingMomentumUpdaterHook(MomentumUpdaterHook): - - def __init__(self, min_momentum=None, min_momentum_ratio=None, **kwargs): - assert (min_momentum is None) ^ (min_momentum_ratio is None) - self.min_momentum = min_momentum - self.min_momentum_ratio = min_momentum_ratio - super(CosineAnnealingMomentumUpdaterHook, self).__init__(**kwargs) - - def get_momentum(self, runner, base_momentum): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - if self.min_momentum_ratio is not None: - target_momentum = base_momentum * self.min_momentum_ratio - else: - target_momentum = self.min_momentum - return annealing_cos(base_momentum, target_momentum, - progress / max_progress) - - -@HOOKS.register_module() -class CyclicMomentumUpdaterHook(MomentumUpdaterHook): - """Cyclic momentum Scheduler. - - Implement the cyclical momentum scheduler policy described in - https://arxiv.org/pdf/1708.07120.pdf - - This momentum scheduler usually used together with the CyclicLRUpdater - to improve the performance in the 3D detection area. - - Attributes: - target_ratio (tuple[float]): Relative ratio of the lowest momentum and - the highest momentum to the initial momentum. - cyclic_times (int): Number of cycles during training - step_ratio_up (float): The ratio of the increasing process of momentum - in the total cycle. - by_epoch (bool): Whether to update momentum by epoch. 
- """ - - def __init__(self, - by_epoch=False, - target_ratio=(0.85 / 0.95, 1), - cyclic_times=1, - step_ratio_up=0.4, - **kwargs): - if isinstance(target_ratio, float): - target_ratio = (target_ratio, target_ratio / 1e5) - elif isinstance(target_ratio, tuple): - target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ - if len(target_ratio) == 1 else target_ratio - else: - raise ValueError('target_ratio should be either float ' - f'or tuple, got {type(target_ratio)}') - - assert len(target_ratio) == 2, \ - '"target_ratio" must be list or tuple of two floats' - assert 0 <= step_ratio_up < 1.0, \ - '"step_ratio_up" must be in range [0,1)' - - self.target_ratio = target_ratio - self.cyclic_times = cyclic_times - self.step_ratio_up = step_ratio_up - self.momentum_phases = [] # init momentum_phases - # currently only support by_epoch=False - assert not by_epoch, \ - 'currently only support "by_epoch" = False' - super(CyclicMomentumUpdaterHook, self).__init__(by_epoch, **kwargs) - - def before_run(self, runner): - super(CyclicMomentumUpdaterHook, self).before_run(runner) - # initiate momentum_phases - # total momentum_phases are separated as up and down - max_iter_per_phase = runner.max_iters // self.cyclic_times - iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) - self.momentum_phases.append( - [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) - self.momentum_phases.append([ - iter_up_phase, max_iter_per_phase, max_iter_per_phase, - self.target_ratio[0], self.target_ratio[1] - ]) - - def get_momentum(self, runner, base_momentum): - curr_iter = runner.iter - for (start_iter, end_iter, max_iter_per_phase, start_ratio, - end_ratio) in self.momentum_phases: - curr_iter %= max_iter_per_phase - if start_iter <= curr_iter < end_iter: - progress = curr_iter - start_iter - return annealing_cos(base_momentum * start_ratio, - base_momentum * end_ratio, - progress / (end_iter - start_iter)) - - -@HOOKS.register_module() -class OneCycleMomentumUpdaterHook(MomentumUpdaterHook): - """OneCycle momentum Scheduler. - - This momentum scheduler usually used together with the OneCycleLrUpdater - to improve the performance. - - Args: - base_momentum (float or list): Lower momentum boundaries in the cycle - for each parameter group. Note that momentum is cycled inversely - to learning rate; at the peak of a cycle, momentum is - 'base_momentum' and learning rate is 'max_lr'. - Default: 0.85 - max_momentum (float or list): Upper momentum boundaries in the cycle - for each parameter group. Functionally, - it defines the cycle amplitude (max_momentum - base_momentum). - Note that momentum is cycled inversely - to learning rate; at the start of a cycle, momentum is - 'max_momentum' and learning rate is 'base_lr' - Default: 0.95 - pct_start (float): The percentage of the cycle (in number of steps) - spent increasing the learning rate. - Default: 0.3 - anneal_strategy (str): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. - Default: 'cos' - three_phase (bool): If three_phase is True, use a third phase of the - schedule to annihilate the learning rate according to - final_div_factor instead of modifying the second phase (the first - two phases will be symmetrical about the step indicated by - pct_start). 
- Default: False
- """
-
- def __init__(self,
- base_momentum=0.85,
- max_momentum=0.95,
- pct_start=0.3,
- anneal_strategy='cos',
- three_phase=False,
- **kwargs):
- # validate by_epoch, currently only support by_epoch=False
- if 'by_epoch' not in kwargs:
- kwargs['by_epoch'] = False
- else:
- assert not kwargs['by_epoch'], \
- 'currently only support "by_epoch" = False'
- if not isinstance(base_momentum, (float, list, dict)):
- raise ValueError('base_momentum must be a float, '
- 'list or dict.')
- self._base_momentum = base_momentum
- if not isinstance(max_momentum, (float, list, dict)):
- raise ValueError('max_momentum must be a float, '
- 'list or dict.')
- self._max_momentum = max_momentum
- # validate pct_start
- if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
- raise ValueError('expected a float between 0 and 1 for pct_start, '
- f'but got {pct_start}')
- self.pct_start = pct_start
- # validate anneal_strategy
- if anneal_strategy not in ['cos', 'linear']:
- raise ValueError('anneal_strategy must be one of "cos" or '
- f'"linear", instead got {anneal_strategy}')
- elif anneal_strategy == 'cos':
- self.anneal_func = annealing_cos
- elif anneal_strategy == 'linear':
- self.anneal_func = annealing_linear
- self.three_phase = three_phase
- self.momentum_phases = [] # init momentum_phases
- super(OneCycleMomentumUpdaterHook, self).__init__(**kwargs)
-
- def before_run(self, runner):
- if isinstance(runner.optimizer, dict):
- for k, optim in runner.optimizer.items():
- if ('momentum' not in optim.defaults
- and 'betas' not in optim.defaults):
- raise ValueError('optimizer must support momentum with '
- 'option enabled')
- self.use_beta1 = 'betas' in optim.defaults
- _base_momentum = format_param(k, optim, self._base_momentum)
- _max_momentum = format_param(k, optim, self._max_momentum)
- for group, b_momentum, m_momentum in zip(
- optim.param_groups, _base_momentum, _max_momentum):
- if self.use_beta1:
- _, beta2 = group['betas']
- group['betas'] = (m_momentum, beta2)
- else:
- group['momentum'] = m_momentum
- group['base_momentum'] = b_momentum
- group['max_momentum'] = m_momentum
- else:
- optim = runner.optimizer
- if ('momentum' not in optim.defaults
- and 'betas' not in optim.defaults):
- raise ValueError('optimizer must support momentum with '
- 'option enabled')
- self.use_beta1 = 'betas' in optim.defaults
- k = type(optim).__name__
- _base_momentum = format_param(k, optim, self._base_momentum)
- _max_momentum = format_param(k, optim, self._max_momentum)
- for group, b_momentum, m_momentum in zip(optim.param_groups,
- _base_momentum,
- _max_momentum):
- if self.use_beta1:
- _, beta2 = group['betas']
- group['betas'] = (m_momentum, beta2)
- else:
- group['momentum'] = m_momentum
- group['base_momentum'] = b_momentum
- group['max_momentum'] = m_momentum
-
- if self.three_phase:
- self.momentum_phases.append({
- 'end_iter':
- float(self.pct_start * runner.max_iters) - 1,
- 'start_momentum':
- 'max_momentum',
- 'end_momentum':
- 'base_momentum'
- })
- self.momentum_phases.append({
- 'end_iter':
- float(2 * self.pct_start * runner.max_iters) - 2,
- 'start_momentum':
- 'base_momentum',
- 'end_momentum':
- 'max_momentum'
- })
- self.momentum_phases.append({
- 'end_iter': runner.max_iters - 1,
- 'start_momentum': 'max_momentum',
- 'end_momentum': 'max_momentum'
- })
- else:
- self.momentum_phases.append({
- 'end_iter':
- float(self.pct_start * runner.max_iters) - 1,
- 'start_momentum':
- 'max_momentum',
- 'end_momentum':
- 'base_momentum'
- })
- self.momentum_phases.append({ - 'end_iter': runner.max_iters - 1, - 'start_momentum': 'base_momentum', - 'end_momentum': 'max_momentum' - }) - - def _set_momentum(self, runner, momentum_groups): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - for param_group, mom in zip(optim.param_groups, - momentum_groups[k]): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - else: - for param_group, mom in zip(runner.optimizer.param_groups, - momentum_groups): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - - def get_momentum(self, runner, param_group): - curr_iter = runner.iter - start_iter = 0 - for i, phase in enumerate(self.momentum_phases): - end_iter = phase['end_iter'] - if curr_iter <= end_iter or i == len(self.momentum_phases) - 1: - pct = (curr_iter - start_iter) / (end_iter - start_iter) - momentum = self.anneal_func( - param_group[phase['start_momentum']], - param_group[phase['end_momentum']], pct) - break - start_iter = end_iter - return momentum - - def get_regular_momentum(self, runner): - if isinstance(runner.optimizer, dict): - momentum_groups = {} - for k, optim in runner.optimizer.items(): - _momentum_group = [ - self.get_momentum(runner, param_group) - for param_group in optim.param_groups - ] - momentum_groups.update({k: _momentum_group}) - return momentum_groups - else: - momentum_groups = [] - for param_group in runner.optimizer.param_groups: - momentum_groups.append(self.get_momentum(runner, param_group)) - return momentum_groups diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/optimizer.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/optimizer.py deleted file mode 100644 index 4ef3e9ff8f9c6926e32bdf027612267b64ed80df..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/optimizer.py +++ /dev/null @@ -1,508 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from collections import defaultdict -from itertools import chain - -from torch.nn.utils import clip_grad - -from annotator.uniformer.mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version -from ..dist_utils import allreduce_grads -from ..fp16_utils import LossScaler, wrap_fp16_model -from .hook import HOOKS, Hook - -try: - # If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported - # and used; otherwise, auto fp16 will adopt mmcv's implementation. - from torch.cuda.amp import GradScaler -except ImportError: - pass - - -@HOOKS.register_module() -class OptimizerHook(Hook): - - def __init__(self, grad_clip=None): - self.grad_clip = grad_clip - - def clip_grads(self, params): - params = list( - filter(lambda p: p.requires_grad and p.grad is not None, params)) - if len(params) > 0: - return clip_grad.clip_grad_norm_(params, **self.grad_clip) - - def after_train_iter(self, runner): - runner.optimizer.zero_grad() - runner.outputs['loss'].backward() - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - runner.optimizer.step() - - -@HOOKS.register_module() -class GradientCumulativeOptimizerHook(OptimizerHook): - """Optimizer Hook implements multi-iters gradient cumulating. 
- - Args: - cumulative_iters (int, optional): Num of gradient cumulative iters. - The optimizer will step every `cumulative_iters` iters. - Defaults to 1. - - Examples: - >>> # Use cumulative_iters to simulate a large batch size - >>> # It is helpful when the hardware cannot handle a large batch size. - >>> loader = DataLoader(data, batch_size=64) - >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4) - >>> # almost equals to - >>> loader = DataLoader(data, batch_size=256) - >>> optim_hook = OptimizerHook() - """ - - def __init__(self, cumulative_iters=1, **kwargs): - super(GradientCumulativeOptimizerHook, self).__init__(**kwargs) - - assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \ - f'cumulative_iters only accepts positive int, but got ' \ - f'{type(cumulative_iters)} instead.' - - self.cumulative_iters = cumulative_iters - self.divisible_iters = 0 - self.remainder_iters = 0 - self.initialized = False - - def has_batch_norm(self, module): - if isinstance(module, _BatchNorm): - return True - for m in module.children(): - if self.has_batch_norm(m): - return True - return False - - def _init(self, runner): - if runner.iter % self.cumulative_iters != 0: - runner.logger.warning( - 'Resume iter number is not divisible by cumulative_iters in ' - 'GradientCumulativeOptimizerHook, which means the gradient of ' - 'some iters is lost and the result may be influenced slightly.' - ) - - if self.has_batch_norm(runner.model) and self.cumulative_iters > 1: - runner.logger.warning( - 'GradientCumulativeOptimizerHook may slightly decrease ' - 'performance if the model has BatchNorm layers.') - - residual_iters = runner.max_iters - runner.iter - - self.divisible_iters = ( - residual_iters // self.cumulative_iters * self.cumulative_iters) - self.remainder_iters = residual_iters - self.divisible_iters - - self.initialized = True - - def after_train_iter(self, runner): - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - loss = runner.outputs['loss'] - loss = loss / loss_factor - loss.backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - runner.optimizer.step() - runner.optimizer.zero_grad() - - -if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - - @HOOKS.register_module() - class Fp16OptimizerHook(OptimizerHook): - """FP16 optimizer hook (using PyTorch's implementation). - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, - to take care of the optimization procedure. - - Args: - loss_scale (float | str | dict): Scale factor configuration. - If loss_scale is a float, static loss scaling will be used with - the specified scale. If loss_scale is a string, it must be - 'dynamic', then dynamic loss scaling will be used. - It can also be a dict containing arguments of GradScalar. - Defaults to 512. For Pytorch >= 1.6, mmcv uses official - implementation of GradScaler. If you use a dict version of - loss_scale to create GradScaler, please refer to: - https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler - for the parameters. - - Examples: - >>> loss_scale = dict( - ... 
init_scale=65536.0, - ... growth_factor=2.0, - ... backoff_factor=0.5, - ... growth_interval=2000 - ... ) - >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) - """ - - def __init__(self, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - loss_scale=512., - distributed=True): - self.grad_clip = grad_clip - self.coalesce = coalesce - self.bucket_size_mb = bucket_size_mb - self.distributed = distributed - self._scale_update_param = None - if loss_scale == 'dynamic': - self.loss_scaler = GradScaler() - elif isinstance(loss_scale, float): - self._scale_update_param = loss_scale - self.loss_scaler = GradScaler(init_scale=loss_scale) - elif isinstance(loss_scale, dict): - self.loss_scaler = GradScaler(**loss_scale) - else: - raise ValueError('loss_scale must be of type float, dict, or ' - f'"dynamic", got {loss_scale}') - - def before_run(self, runner): - """Preparing steps before Mixed Precision Training.""" - # wrap model mode to fp16 - wrap_fp16_model(runner.model) - # resume from state dict - if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: - scaler_state_dict = runner.meta['fp16']['loss_scaler'] - self.loss_scaler.load_state_dict(scaler_state_dict) - - def copy_grads_to_fp32(self, fp16_net, fp32_weights): - """Copy gradients from fp16 model to fp32 weight copy.""" - for fp32_param, fp16_param in zip(fp32_weights, - fp16_net.parameters()): - if fp16_param.grad is not None: - if fp32_param.grad is None: - fp32_param.grad = fp32_param.data.new( - fp32_param.size()) - fp32_param.grad.copy_(fp16_param.grad) - - def copy_params_to_fp16(self, fp16_net, fp32_weights): - """Copy updated params from fp32 weight copy to fp16 model.""" - for fp16_param, fp32_param in zip(fp16_net.parameters(), - fp32_weights): - fp16_param.data.copy_(fp32_param.data) - - def after_train_iter(self, runner): - """Backward optimization steps for Mixed Precision Training. For - dynamic loss scaling, please refer to - https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler. - - 1. Scale the loss by a scale factor. - 2. Backward the loss to obtain the gradients. - 3. Unscale the optimizer’s gradient tensors. - 4. Call optimizer.step() and update scale factor. - 5. Save loss_scaler state_dict for resume purpose. - """ - # clear grads of last iteration - runner.model.zero_grad() - runner.optimizer.zero_grad() - - self.loss_scaler.scale(runner.outputs['loss']).backward() - self.loss_scaler.unscale_(runner.optimizer) - # grad clip - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # backward and update scaler - self.loss_scaler.step(runner.optimizer) - self.loss_scaler.update(self._scale_update_param) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - @HOOKS.register_module() - class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, - Fp16OptimizerHook): - """Fp16 optimizer Hook (using PyTorch's implementation) implements - multi-iters gradient cumulating. - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, - to take care of the optimization procedure. 
- """ - - def __init__(self, *args, **kwargs): - super(GradientCumulativeFp16OptimizerHook, - self).__init__(*args, **kwargs) - - def after_train_iter(self, runner): - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - loss = runner.outputs['loss'] - loss = loss / loss_factor - - self.loss_scaler.scale(loss).backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - # copy fp16 grads in the model to fp32 params in the optimizer - self.loss_scaler.unscale_(runner.optimizer) - - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - - # backward and update scaler - self.loss_scaler.step(runner.optimizer) - self.loss_scaler.update(self._scale_update_param) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - # clear grads - runner.model.zero_grad() - runner.optimizer.zero_grad() - -else: - - @HOOKS.register_module() - class Fp16OptimizerHook(OptimizerHook): - """FP16 optimizer hook (mmcv's implementation). - - The steps of fp16 optimizer is as follows. - 1. Scale the loss value. - 2. BP in the fp16 model. - 2. Copy gradients from fp16 model to fp32 weights. - 3. Update fp32 weights. - 4. Copy updated parameters from fp32 weights to fp16 model. - - Refer to https://arxiv.org/abs/1710.03740 for more details. - - Args: - loss_scale (float | str | dict): Scale factor configuration. - If loss_scale is a float, static loss scaling will be used with - the specified scale. If loss_scale is a string, it must be - 'dynamic', then dynamic loss scaling will be used. - It can also be a dict containing arguments of LossScaler. - Defaults to 512. - """ - - def __init__(self, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - loss_scale=512., - distributed=True): - self.grad_clip = grad_clip - self.coalesce = coalesce - self.bucket_size_mb = bucket_size_mb - self.distributed = distributed - if loss_scale == 'dynamic': - self.loss_scaler = LossScaler(mode='dynamic') - elif isinstance(loss_scale, float): - self.loss_scaler = LossScaler( - init_scale=loss_scale, mode='static') - elif isinstance(loss_scale, dict): - self.loss_scaler = LossScaler(**loss_scale) - else: - raise ValueError('loss_scale must be of type float, dict, or ' - f'"dynamic", got {loss_scale}') - - def before_run(self, runner): - """Preparing steps before Mixed Precision Training. - - 1. Make a master copy of fp32 weights for optimization. - 2. Convert the main model from fp32 to fp16. 
- """ - # keep a copy of fp32 weights - old_groups = runner.optimizer.param_groups - runner.optimizer.param_groups = copy.deepcopy( - runner.optimizer.param_groups) - state = defaultdict(dict) - p_map = { - old_p: p - for old_p, p in zip( - chain(*(g['params'] for g in old_groups)), - chain(*(g['params'] - for g in runner.optimizer.param_groups))) - } - for k, v in runner.optimizer.state.items(): - state[p_map[k]] = v - runner.optimizer.state = state - # convert model to fp16 - wrap_fp16_model(runner.model) - # resume from state dict - if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: - scaler_state_dict = runner.meta['fp16']['loss_scaler'] - self.loss_scaler.load_state_dict(scaler_state_dict) - - def copy_grads_to_fp32(self, fp16_net, fp32_weights): - """Copy gradients from fp16 model to fp32 weight copy.""" - for fp32_param, fp16_param in zip(fp32_weights, - fp16_net.parameters()): - if fp16_param.grad is not None: - if fp32_param.grad is None: - fp32_param.grad = fp32_param.data.new( - fp32_param.size()) - fp32_param.grad.copy_(fp16_param.grad) - - def copy_params_to_fp16(self, fp16_net, fp32_weights): - """Copy updated params from fp32 weight copy to fp16 model.""" - for fp16_param, fp32_param in zip(fp16_net.parameters(), - fp32_weights): - fp16_param.data.copy_(fp32_param.data) - - def after_train_iter(self, runner): - """Backward optimization steps for Mixed Precision Training. For - dynamic loss scaling, please refer `loss_scalar.py` - - 1. Scale the loss by a scale factor. - 2. Backward the loss to obtain the gradients (fp16). - 3. Copy gradients from the model to the fp32 weight copy. - 4. Scale the gradients back and update the fp32 weight copy. - 5. Copy back the params from fp32 weight copy to the fp16 model. - 6. Save loss_scaler state_dict for resume purpose. 
- """ - # clear grads of last iteration - runner.model.zero_grad() - runner.optimizer.zero_grad() - # scale the loss value - scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale - scaled_loss.backward() - # copy fp16 grads in the model to fp32 params in the optimizer - - fp32_weights = [] - for param_group in runner.optimizer.param_groups: - fp32_weights += param_group['params'] - self.copy_grads_to_fp32(runner.model, fp32_weights) - # allreduce grads - if self.distributed: - allreduce_grads(fp32_weights, self.coalesce, - self.bucket_size_mb) - - has_overflow = self.loss_scaler.has_overflow(fp32_weights) - # if has overflow, skip this iteration - if not has_overflow: - # scale the gradients back - for param in fp32_weights: - if param.grad is not None: - param.grad.div_(self.loss_scaler.loss_scale) - if self.grad_clip is not None: - grad_norm = self.clip_grads(fp32_weights) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # update fp32 params - runner.optimizer.step() - # copy fp32 params to the fp16 model - self.copy_params_to_fp16(runner.model, fp32_weights) - self.loss_scaler.update_scale(has_overflow) - if has_overflow: - runner.logger.warning('Check overflow, downscale loss scale ' - f'to {self.loss_scaler.cur_scale}') - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - @HOOKS.register_module() - class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, - Fp16OptimizerHook): - """Fp16 optimizer Hook (using mmcv implementation) implements multi- - iters gradient cumulating.""" - - def __init__(self, *args, **kwargs): - super(GradientCumulativeFp16OptimizerHook, - self).__init__(*args, **kwargs) - - def after_train_iter(self, runner): - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - - loss = runner.outputs['loss'] - loss = loss / loss_factor - - # scale the loss value - scaled_loss = loss * self.loss_scaler.loss_scale - scaled_loss.backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - # copy fp16 grads in the model to fp32 params in the optimizer - fp32_weights = [] - for param_group in runner.optimizer.param_groups: - fp32_weights += param_group['params'] - self.copy_grads_to_fp32(runner.model, fp32_weights) - # allreduce grads - if self.distributed: - allreduce_grads(fp32_weights, self.coalesce, - self.bucket_size_mb) - - has_overflow = self.loss_scaler.has_overflow(fp32_weights) - # if has overflow, skip this iteration - if not has_overflow: - # scale the gradients back - for param in fp32_weights: - if param.grad is not None: - param.grad.div_(self.loss_scaler.loss_scale) - if self.grad_clip is not None: - grad_norm = self.clip_grads(fp32_weights) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # update fp32 params - runner.optimizer.step() - # copy fp32 params to the fp16 model - self.copy_params_to_fp16(runner.model, fp32_weights) - else: - runner.logger.warning( - 'Check overflow, downscale loss scale ' - f'to {self.loss_scaler.cur_scale}') - - self.loss_scaler.update_scale(has_overflow) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = 
self.loss_scaler.state_dict() - - # clear grads - runner.model.zero_grad() - runner.optimizer.zero_grad() diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/profiler.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/profiler.py deleted file mode 100644 index b70236997eec59c2209ef351ae38863b4112d0ec..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/profiler.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from typing import Callable, List, Optional, Union - -import torch - -from ..dist_utils import master_only -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class ProfilerHook(Hook): - """Profiler to analyze performance during training. - - PyTorch Profiler is a tool that allows the collection of the performance - metrics during the training. More details on Profiler can be found at - https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile - - Args: - by_epoch (bool): Profile performance by epoch or by iteration. - Default: True. - profile_iters (int): Number of iterations for profiling. - If ``by_epoch=True``, profile_iters indicates that they are the - first profile_iters epochs at the beginning of the - training, otherwise it indicates the first profile_iters - iterations. Default: 1. - activities (list[str]): List of activity groups (CPU, CUDA) to use in - profiling. Default: ['cpu', 'cuda']. - schedule (dict, optional): Config of generating the callable schedule. - if schedule is None, profiler will not add step markers into the - trace and table view. Default: None. - on_trace_ready (callable, dict): Either a handler or a dict of generate - handler. Default: None. - record_shapes (bool): Save information about operator's input shapes. - Default: False. - profile_memory (bool): Track tensor memory allocation/deallocation. - Default: False. - with_stack (bool): Record source information (file and line number) - for the ops. Default: False. - with_flops (bool): Use formula to estimate the FLOPS of specific - operators (matrix multiplication and 2D convolution). - Default: False. - json_trace_path (str, optional): Exports the collected trace in Chrome - JSON format. Default: None. - - Example: - >>> runner = ... # instantiate a Runner - >>> # tensorboard trace - >>> trace_config = dict(type='tb_trace', dir_name='work_dir') - >>> profiler_config = dict(on_trace_ready=trace_config) - >>> runner.register_profiler_hook(profiler_config) - >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)]) - """ - - def __init__(self, - by_epoch: bool = True, - profile_iters: int = 1, - activities: List[str] = ['cpu', 'cuda'], - schedule: Optional[dict] = None, - on_trace_ready: Optional[Union[Callable, dict]] = None, - record_shapes: bool = False, - profile_memory: bool = False, - with_stack: bool = False, - with_flops: bool = False, - json_trace_path: Optional[str] = None) -> None: - try: - from torch import profiler # torch version >= 1.8.1 - except ImportError: - raise ImportError('profiler is the new feature of torch1.8.1, ' - f'but your version is {torch.__version__}') - - assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.' 
- self.by_epoch = by_epoch - - if profile_iters < 1: - raise ValueError('profile_iters should be greater than 0, but got ' - f'{profile_iters}') - self.profile_iters = profile_iters - - if not isinstance(activities, list): - raise ValueError( - f'activities should be list, but got {type(activities)}') - self.activities = [] - for activity in activities: - activity = activity.lower() - if activity == 'cpu': - self.activities.append(profiler.ProfilerActivity.CPU) - elif activity == 'cuda': - self.activities.append(profiler.ProfilerActivity.CUDA) - else: - raise ValueError( - f'activity should be "cpu" or "cuda", but got {activity}') - - if schedule is not None: - self.schedule = profiler.schedule(**schedule) - else: - self.schedule = None - - self.on_trace_ready = on_trace_ready - self.record_shapes = record_shapes - self.profile_memory = profile_memory - self.with_stack = with_stack - self.with_flops = with_flops - self.json_trace_path = json_trace_path - - @master_only - def before_run(self, runner): - if self.by_epoch and runner.max_epochs < self.profile_iters: - raise ValueError('self.profile_iters should not be greater than ' - f'{runner.max_epochs}') - - if not self.by_epoch and runner.max_iters < self.profile_iters: - raise ValueError('self.profile_iters should not be greater than ' - f'{runner.max_iters}') - - if callable(self.on_trace_ready): # handler - _on_trace_ready = self.on_trace_ready - elif isinstance(self.on_trace_ready, dict): # config of handler - trace_cfg = self.on_trace_ready.copy() - trace_type = trace_cfg.pop('type') # log_trace handler - if trace_type == 'log_trace': - - def _log_handler(prof): - print(prof.key_averages().table(**trace_cfg)) - - _on_trace_ready = _log_handler - elif trace_type == 'tb_trace': # tensorboard_trace handler - try: - import torch_tb_profiler # noqa: F401 - except ImportError: - raise ImportError('please run "pip install ' - 'torch-tb-profiler" to install ' - 'torch_tb_profiler') - _on_trace_ready = torch.profiler.tensorboard_trace_handler( - **trace_cfg) - else: - raise ValueError('trace_type should be "log_trace" or ' - f'"tb_trace", but got {trace_type}') - elif self.on_trace_ready is None: - _on_trace_ready = None # type: ignore - else: - raise ValueError('on_trace_ready should be handler, dict or None, ' - f'but got {type(self.on_trace_ready)}') - - if runner.max_epochs > 1: - warnings.warn(f'profiler will profile {runner.max_epochs} epochs ' - 'instead of 1 epoch. Since profiler will slow down ' - 'the training, it is recommended to train 1 epoch ' - 'with ProfilerHook and adjust your setting according' - ' to the profiler summary. 
During normal training ' - '(epoch > 1), you may disable the ProfilerHook.') - - self.profiler = torch.profiler.profile( - activities=self.activities, - schedule=self.schedule, - on_trace_ready=_on_trace_ready, - record_shapes=self.record_shapes, - profile_memory=self.profile_memory, - with_stack=self.with_stack, - with_flops=self.with_flops) - - self.profiler.__enter__() - runner.logger.info('profiler is profiling...') - - @master_only - def after_train_epoch(self, runner): - if self.by_epoch and runner.epoch == self.profile_iters - 1: - runner.logger.info('profiler may take a few minutes...') - self.profiler.__exit__(None, None, None) - if self.json_trace_path is not None: - self.profiler.export_chrome_trace(self.json_trace_path) - - @master_only - def after_train_iter(self, runner): - self.profiler.step() - if not self.by_epoch and runner.iter == self.profile_iters - 1: - runner.logger.info('profiler may take a few minutes...') - self.profiler.__exit__(None, None, None) - if self.json_trace_path is not None: - self.profiler.export_chrome_trace(self.json_trace_path) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py deleted file mode 100644 index ee0dc6bdd8df5775857028aaed5444c0f59caf80..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class DistSamplerSeedHook(Hook): - """Data-loading sampler for distributed training. - - When distributed training, it is only useful in conjunction with - :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same - purpose with :obj:`IterLoader`. - """ - - def before_epoch(self, runner): - if hasattr(runner.data_loader.sampler, 'set_epoch'): - # in case the data loader uses `SequentialSampler` in Pytorch - runner.data_loader.sampler.set_epoch(runner.epoch) - elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): - # batch sampler in pytorch warps the sampler as its attributes. - runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py b/ControlNet/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py deleted file mode 100644 index 6376b7ff894280cb2782243b25e8973650591577..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..dist_utils import allreduce_params -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class SyncBuffersHook(Hook): - """Synchronize model buffers such as running_mean and running_var in BN at - the end of each epoch. - - Args: - distributed (bool): Whether distributed training is used. It is - effective only for distributed training. Defaults to True. 
- """ - - def __init__(self, distributed=True): - self.distributed = distributed - - def after_epoch(self, runner): - """All-reduce model buffers at the end of each epoch.""" - if self.distributed: - allreduce_params(runner.model.buffers()) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/iter_based_runner.py b/ControlNet/annotator/uniformer/mmcv/runner/iter_based_runner.py deleted file mode 100644 index 1df4de8c0285669dec9b014dfd1f3dd1600f0831..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/iter_based_runner.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import platform -import shutil -import time -import warnings - -import torch -from torch.optim import Optimizer - -import annotator.uniformer.mmcv as mmcv -from .base_runner import BaseRunner -from .builder import RUNNERS -from .checkpoint import save_checkpoint -from .hooks import IterTimerHook -from .utils import get_host_info - - -class IterLoader: - - def __init__(self, dataloader): - self._dataloader = dataloader - self.iter_loader = iter(self._dataloader) - self._epoch = 0 - - @property - def epoch(self): - return self._epoch - - def __next__(self): - try: - data = next(self.iter_loader) - except StopIteration: - self._epoch += 1 - if hasattr(self._dataloader.sampler, 'set_epoch'): - self._dataloader.sampler.set_epoch(self._epoch) - time.sleep(2) # Prevent possible deadlock during epoch transition - self.iter_loader = iter(self._dataloader) - data = next(self.iter_loader) - - return data - - def __len__(self): - return len(self._dataloader) - - -@RUNNERS.register_module() -class IterBasedRunner(BaseRunner): - """Iteration-based Runner. - - This runner train models iteration by iteration. - """ - - def train(self, data_loader, **kwargs): - self.model.train() - self.mode = 'train' - self.data_loader = data_loader - self._epoch = data_loader.epoch - data_batch = next(data_loader) - self.call_hook('before_train_iter') - outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('model.train_step() must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - self.call_hook('after_train_iter') - self._inner_iter += 1 - self._iter += 1 - - @torch.no_grad() - def val(self, data_loader, **kwargs): - self.model.eval() - self.mode = 'val' - self.data_loader = data_loader - data_batch = next(data_loader) - self.call_hook('before_val_iter') - outputs = self.model.val_step(data_batch, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('model.val_step() must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - self.call_hook('after_val_iter') - self._inner_iter += 1 - - def run(self, data_loaders, workflow, max_iters=None, **kwargs): - """Start running. - - Args: - data_loaders (list[:obj:`DataLoader`]): Dataloaders for training - and validation. - workflow (list[tuple]): A list of (phase, iters) to specify the - running order and iterations. E.g, [('train', 10000), - ('val', 1000)] means running 10000 iterations for training and - 1000 iterations for validation, iteratively. 
- """ - assert isinstance(data_loaders, list) - assert mmcv.is_list_of(workflow, tuple) - assert len(data_loaders) == len(workflow) - if max_iters is not None: - warnings.warn( - 'setting max_iters in run is deprecated, ' - 'please set max_iters in runner_config', DeprecationWarning) - self._max_iters = max_iters - assert self._max_iters is not None, ( - 'max_iters must be specified during instantiation') - - work_dir = self.work_dir if self.work_dir is not None else 'NONE' - self.logger.info('Start running, host: %s, work_dir: %s', - get_host_info(), work_dir) - self.logger.info('Hooks will be executed in the following order:\n%s', - self.get_hook_info()) - self.logger.info('workflow: %s, max: %d iters', workflow, - self._max_iters) - self.call_hook('before_run') - - iter_loaders = [IterLoader(x) for x in data_loaders] - - self.call_hook('before_epoch') - - while self.iter < self._max_iters: - for i, flow in enumerate(workflow): - self._inner_iter = 0 - mode, iters = flow - if not isinstance(mode, str) or not hasattr(self, mode): - raise ValueError( - 'runner has no method named "{}" to run a workflow'. - format(mode)) - iter_runner = getattr(self, mode) - for _ in range(iters): - if mode == 'train' and self.iter >= self._max_iters: - break - iter_runner(iter_loaders[i], **kwargs) - - time.sleep(1) # wait for some hooks like loggers to finish - self.call_hook('after_epoch') - self.call_hook('after_run') - - def resume(self, - checkpoint, - resume_optimizer=True, - map_location='default'): - """Resume model from checkpoint. - - Args: - checkpoint (str): Checkpoint to resume from. - resume_optimizer (bool, optional): Whether resume the optimizer(s) - if the checkpoint file includes optimizer(s). Default to True. - map_location (str, optional): Same as :func:`torch.load`. - Default to 'default'. - """ - if map_location == 'default': - device_id = torch.cuda.current_device() - checkpoint = self.load_checkpoint( - checkpoint, - map_location=lambda storage, loc: storage.cuda(device_id)) - else: - checkpoint = self.load_checkpoint( - checkpoint, map_location=map_location) - - self._epoch = checkpoint['meta']['epoch'] - self._iter = checkpoint['meta']['iter'] - self._inner_iter = checkpoint['meta']['iter'] - if 'optimizer' in checkpoint and resume_optimizer: - if isinstance(self.optimizer, Optimizer): - self.optimizer.load_state_dict(checkpoint['optimizer']) - elif isinstance(self.optimizer, dict): - for k in self.optimizer.keys(): - self.optimizer[k].load_state_dict( - checkpoint['optimizer'][k]) - else: - raise TypeError( - 'Optimizer should be dict or torch.optim.Optimizer ' - f'but got {type(self.optimizer)}') - - self.logger.info(f'resumed from epoch: {self.epoch}, iter {self.iter}') - - def save_checkpoint(self, - out_dir, - filename_tmpl='iter_{}.pth', - meta=None, - save_optimizer=True, - create_symlink=True): - """Save checkpoint to file. - - Args: - out_dir (str): Directory to save checkpoint files. - filename_tmpl (str, optional): Checkpoint file template. - Defaults to 'iter_{}.pth'. - meta (dict, optional): Metadata to be saved in checkpoint. - Defaults to None. - save_optimizer (bool, optional): Whether save optimizer. - Defaults to True. - create_symlink (bool, optional): Whether create symlink to the - latest checkpoint file. Defaults to True. 
- """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError( - f'meta should be a dict or None, but got {type(meta)}') - if self.meta is not None: - meta.update(self.meta) - # Note: meta.update(self.meta) should be done before - # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise - # there will be problems with resumed checkpoints. - # More details in https://github.com/open-mmlab/mmcv/pull/1108 - meta.update(epoch=self.epoch + 1, iter=self.iter) - - filename = filename_tmpl.format(self.iter + 1) - filepath = osp.join(out_dir, filename) - optimizer = self.optimizer if save_optimizer else None - save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) - # in some environments, `os.symlink` is not supported, you may need to - # set `create_symlink` to False - if create_symlink: - dst_file = osp.join(out_dir, 'latest.pth') - if platform.system() != 'Windows': - mmcv.symlink(filename, dst_file) - else: - shutil.copy(filepath, dst_file) - - def register_training_hooks(self, - lr_config, - optimizer_config=None, - checkpoint_config=None, - log_config=None, - momentum_config=None, - custom_hooks_config=None): - """Register default hooks for iter-based training. - - Checkpoint hook, optimizer stepper hook and logger hooks will be set to - `by_epoch=False` by default. - - Default hooks include: - - +----------------------+-------------------------+ - | Hooks | Priority | - +======================+=========================+ - | LrUpdaterHook | VERY_HIGH (10) | - +----------------------+-------------------------+ - | MomentumUpdaterHook | HIGH (30) | - +----------------------+-------------------------+ - | OptimizerStepperHook | ABOVE_NORMAL (40) | - +----------------------+-------------------------+ - | CheckpointSaverHook | NORMAL (50) | - +----------------------+-------------------------+ - | IterTimerHook | LOW (70) | - +----------------------+-------------------------+ - | LoggerHook(s) | VERY_LOW (90) | - +----------------------+-------------------------+ - | CustomHook(s) | defaults to NORMAL (50) | - +----------------------+-------------------------+ - - If custom hooks have same priority with default hooks, custom hooks - will be triggered after default hooks. - """ - if checkpoint_config is not None: - checkpoint_config.setdefault('by_epoch', False) - if lr_config is not None: - lr_config.setdefault('by_epoch', False) - if log_config is not None: - for info in log_config['hooks']: - info.setdefault('by_epoch', False) - super(IterBasedRunner, self).register_training_hooks( - lr_config=lr_config, - momentum_config=momentum_config, - optimizer_config=optimizer_config, - checkpoint_config=checkpoint_config, - log_config=log_config, - timer_config=IterTimerHook(), - custom_hooks_config=custom_hooks_config) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/log_buffer.py b/ControlNet/annotator/uniformer/mmcv/runner/log_buffer.py deleted file mode 100644 index d949e2941c5400088c7cd8a1dc893d8b233ae785..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/log_buffer.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from collections import OrderedDict - -import numpy as np - - -class LogBuffer: - - def __init__(self): - self.val_history = OrderedDict() - self.n_history = OrderedDict() - self.output = OrderedDict() - self.ready = False - - def clear(self): - self.val_history.clear() - self.n_history.clear() - self.clear_output() - - def clear_output(self): - self.output.clear() - self.ready = False - - def update(self, vars, count=1): - assert isinstance(vars, dict) - for key, var in vars.items(): - if key not in self.val_history: - self.val_history[key] = [] - self.n_history[key] = [] - self.val_history[key].append(var) - self.n_history[key].append(count) - - def average(self, n=0): - """Average latest n values or all values.""" - assert n >= 0 - for key in self.val_history: - values = np.array(self.val_history[key][-n:]) - nums = np.array(self.n_history[key][-n:]) - avg = np.sum(values * nums) / np.sum(nums) - self.output[key] = avg - self.ready = True diff --git a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/__init__.py b/ControlNet/annotator/uniformer/mmcv/runner/optimizer/__init__.py deleted file mode 100644 index 53c34d0470992cbc374f29681fdd00dc0e57968d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, - build_optimizer_constructor) -from .default_constructor import DefaultOptimizerConstructor - -__all__ = [ - 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', - 'build_optimizer', 'build_optimizer_constructor' -] diff --git a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/builder.py b/ControlNet/annotator/uniformer/mmcv/runner/optimizer/builder.py deleted file mode 100644 index f9234eed8f1f186d9d8dfda34562157ee39bdb3a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/builder.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
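# Illustrative sketch (not part of the deleted files): how the LogBuffer
# class shown above is typically exercised. The import path below assumes
# the vendored module is still importable (this very diff removes it from
# the repo).
from annotator.uniformer.mmcv.runner.log_buffer import LogBuffer

buf = LogBuffer()
buf.update({'loss': 1.0}, count=2)  # two samples with a mean loss of 1.0
buf.update({'loss': 0.5}, count=2)  # two samples with a mean loss of 0.5
buf.average()                       # count-weighted average over history
print(buf.output['loss'])           # (1.0 * 2 + 0.5 * 2) / 4 = 0.75
buf.clear_output()                  # keep history, reset averaged output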
-import copy -import inspect - -import torch - -from ...utils import Registry, build_from_cfg - -OPTIMIZERS = Registry('optimizer') -OPTIMIZER_BUILDERS = Registry('optimizer builder') - - -def register_torch_optimizers(): - torch_optimizers = [] - for module_name in dir(torch.optim): - if module_name.startswith('__'): - continue - _optim = getattr(torch.optim, module_name) - if inspect.isclass(_optim) and issubclass(_optim, - torch.optim.Optimizer): - OPTIMIZERS.register_module()(_optim) - torch_optimizers.append(module_name) - return torch_optimizers - - -TORCH_OPTIMIZERS = register_torch_optimizers() - - -def build_optimizer_constructor(cfg): - return build_from_cfg(cfg, OPTIMIZER_BUILDERS) - - -def build_optimizer(model, cfg): - optimizer_cfg = copy.deepcopy(cfg) - constructor_type = optimizer_cfg.pop('constructor', - 'DefaultOptimizerConstructor') - paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) - optim_constructor = build_optimizer_constructor( - dict( - type=constructor_type, - optimizer_cfg=optimizer_cfg, - paramwise_cfg=paramwise_cfg)) - optimizer = optim_constructor(model) - return optimizer diff --git a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/default_constructor.py b/ControlNet/annotator/uniformer/mmcv/runner/optimizer/default_constructor.py deleted file mode 100644 index 2c0da3503b75441738efe38d70352b55a210a34a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/optimizer/default_constructor.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import torch -from torch.nn import GroupNorm, LayerNorm - -from annotator.uniformer.mmcv.utils import _BatchNorm, _InstanceNorm, build_from_cfg, is_list_of -from annotator.uniformer.mmcv.utils.ext_loader import check_ops_exist -from .builder import OPTIMIZER_BUILDERS, OPTIMIZERS - - -@OPTIMIZER_BUILDERS.register_module() -class DefaultOptimizerConstructor: - """Default constructor for optimizers. - - By default each parameter share the same optimizer settings, and we - provide an argument ``paramwise_cfg`` to specify parameter-wise settings. - It is a dict and may contain the following fields: - - - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If - one of the keys in ``custom_keys`` is a substring of the name of one - parameter, then the setting of the parameter will be specified by - ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will - be ignored. It should be noted that the aforementioned ``key`` is the - longest key that is a substring of the name of the parameter. If there - are multiple matched keys with the same length, then the key with lower - alphabet order will be chosen. - ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` - and ``decay_mult``. See Example 2 below. - - ``bias_lr_mult`` (float): It will be multiplied to the learning - rate for all bias parameters (except for those in normalization - layers and offset layers of DCN). - - ``bias_decay_mult`` (float): It will be multiplied to the weight - decay for all bias parameters (except for those in - normalization layers, depthwise conv layers, offset layers of DCN). - - ``norm_decay_mult`` (float): It will be multiplied to the weight - decay for all weight and bias parameters of normalization - layers. - - ``dwconv_decay_mult`` (float): It will be multiplied to the weight - decay for all weight and bias parameters of depthwise conv - layers. 
- - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning - rate for parameters of offset layer in the deformable convs - of a model. - - ``bypass_duplicate`` (bool): If true, the duplicate parameters - would not be added into optimizer. Default: False. - - Note: - 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will - override the effect of ``bias_lr_mult`` in the bias of offset - layer. So be careful when using both ``bias_lr_mult`` and - ``dcn_offset_lr_mult``. If you wish to apply both of them to the - offset layer in deformable convs, set ``dcn_offset_lr_mult`` - to the original ``dcn_offset_lr_mult`` * ``bias_lr_mult``. - 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will - apply it to all the DCN layers in the model. So be careful when - the model contains multiple DCN layers in places other than - backbone. - - Args: - model (:obj:`nn.Module`): The model with parameters to be optimized. - optimizer_cfg (dict): The config dict of the optimizer. - Positional fields are - - - `type`: class name of the optimizer. - - Optional fields are - - - any arguments of the corresponding optimizer type, e.g., - lr, weight_decay, momentum, etc. - paramwise_cfg (dict, optional): Parameter-wise options. - - Example 1: - >>> model = torch.nn.modules.Conv1d(1, 1, 1) - >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, - >>> weight_decay=0.0001) - >>> paramwise_cfg = dict(norm_decay_mult=0.) - >>> optim_builder = DefaultOptimizerConstructor( - >>> optimizer_cfg, paramwise_cfg) - >>> optimizer = optim_builder(model) - - Example 2: - >>> # assume model have attribute model.backbone and model.cls_head - >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) - >>> paramwise_cfg = dict(custom_keys={ - '.backbone': dict(lr_mult=0.1, decay_mult=0.9)}) - >>> optim_builder = DefaultOptimizerConstructor( - >>> optimizer_cfg, paramwise_cfg) - >>> optimizer = optim_builder(model) - >>> # Then the `lr` and `weight_decay` for model.backbone is - >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for - >>> # model.cls_head is (0.01, 0.95). 
- """ - - def __init__(self, optimizer_cfg, paramwise_cfg=None): - if not isinstance(optimizer_cfg, dict): - raise TypeError('optimizer_cfg should be a dict', - f'but got {type(optimizer_cfg)}') - self.optimizer_cfg = optimizer_cfg - self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg - self.base_lr = optimizer_cfg.get('lr', None) - self.base_wd = optimizer_cfg.get('weight_decay', None) - self._validate_cfg() - - def _validate_cfg(self): - if not isinstance(self.paramwise_cfg, dict): - raise TypeError('paramwise_cfg should be None or a dict, ' - f'but got {type(self.paramwise_cfg)}') - - if 'custom_keys' in self.paramwise_cfg: - if not isinstance(self.paramwise_cfg['custom_keys'], dict): - raise TypeError( - 'If specified, custom_keys must be a dict, ' - f'but got {type(self.paramwise_cfg["custom_keys"])}') - if self.base_wd is None: - for key in self.paramwise_cfg['custom_keys']: - if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: - raise ValueError('base_wd should not be None') - - # get base lr and weight decay - # weight_decay must be explicitly specified if mult is specified - if ('bias_decay_mult' in self.paramwise_cfg - or 'norm_decay_mult' in self.paramwise_cfg - or 'dwconv_decay_mult' in self.paramwise_cfg): - if self.base_wd is None: - raise ValueError('base_wd should not be None') - - def _is_in(self, param_group, param_group_list): - assert is_list_of(param_group_list, dict) - param = set(param_group['params']) - param_set = set() - for group in param_group_list: - param_set.update(set(group['params'])) - - return not param.isdisjoint(param_set) - - def add_params(self, params, module, prefix='', is_dcn_module=None): - """Add all parameters of module to the params list. - - The parameters of the given module will be added to the list of param - groups, with specific rules defined by paramwise_cfg. - - Args: - params (list[dict]): A list of param groups, it will be modified - in place. - module (nn.Module): The module to be added. - prefix (str): The prefix of the module - is_dcn_module (int|float|None): If the current module is a - submodule of DCN, `is_dcn_module` will be passed to - control conv_offset layer's learning rate. Defaults to None. - """ - # get param-wise options - custom_keys = self.paramwise_cfg.get('custom_keys', {}) - # first sort with alphabet order and then sort with reversed len of str - sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) - - bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) - bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) - norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) - dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) - bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) - dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) - - # special rules for norm layers and depth-wise conv layers - is_norm = isinstance(module, - (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) - is_dwconv = ( - isinstance(module, torch.nn.Conv2d) - and module.in_channels == module.groups) - - for name, param in module.named_parameters(recurse=False): - param_group = {'params': [param]} - if not param.requires_grad: - params.append(param_group) - continue - if bypass_duplicate and self._is_in(param_group, params): - warnings.warn(f'{prefix} is duplicate. 
It is skipped since ' - f'bypass_duplicate={bypass_duplicate}') - continue - # if the parameter match one of the custom keys, ignore other rules - is_custom = False - for key in sorted_keys: - if key in f'{prefix}.{name}': - is_custom = True - lr_mult = custom_keys[key].get('lr_mult', 1.) - param_group['lr'] = self.base_lr * lr_mult - if self.base_wd is not None: - decay_mult = custom_keys[key].get('decay_mult', 1.) - param_group['weight_decay'] = self.base_wd * decay_mult - break - - if not is_custom: - # bias_lr_mult affects all bias parameters - # except for norm.bias dcn.conv_offset.bias - if name == 'bias' and not (is_norm or is_dcn_module): - param_group['lr'] = self.base_lr * bias_lr_mult - - if (prefix.find('conv_offset') != -1 and is_dcn_module - and isinstance(module, torch.nn.Conv2d)): - # deal with both dcn_offset's bias & weight - param_group['lr'] = self.base_lr * dcn_offset_lr_mult - - # apply weight decay policies - if self.base_wd is not None: - # norm decay - if is_norm: - param_group[ - 'weight_decay'] = self.base_wd * norm_decay_mult - # depth-wise conv - elif is_dwconv: - param_group[ - 'weight_decay'] = self.base_wd * dwconv_decay_mult - # bias lr and decay - elif name == 'bias' and not is_dcn_module: - # TODO: current bias_decay_mult will have affect on DCN - param_group[ - 'weight_decay'] = self.base_wd * bias_decay_mult - params.append(param_group) - - if check_ops_exist(): - from annotator.uniformer.mmcv.ops import DeformConv2d, ModulatedDeformConv2d - is_dcn_module = isinstance(module, - (DeformConv2d, ModulatedDeformConv2d)) - else: - is_dcn_module = False - for child_name, child_mod in module.named_children(): - child_prefix = f'{prefix}.{child_name}' if prefix else child_name - self.add_params( - params, - child_mod, - prefix=child_prefix, - is_dcn_module=is_dcn_module) - - def __call__(self, model): - if hasattr(model, 'module'): - model = model.module - - optimizer_cfg = self.optimizer_cfg.copy() - # if no paramwise option is specified, just use the global setting - if not self.paramwise_cfg: - optimizer_cfg['params'] = model.parameters() - return build_from_cfg(optimizer_cfg, OPTIMIZERS) - - # set param-wise lr and weight decay recursively - params = [] - self.add_params(params, model) - optimizer_cfg['params'] = params - - return build_from_cfg(optimizer_cfg, OPTIMIZERS) diff --git a/ControlNet/annotator/uniformer/mmcv/runner/priority.py b/ControlNet/annotator/uniformer/mmcv/runner/priority.py deleted file mode 100644 index 64cc4e3a05f8d5b89ab6eb32461e6e80f1d62e67..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/priority.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from enum import Enum - - -class Priority(Enum): - """Hook priority levels. 
- - +--------------+------------+ - | Level | Value | - +==============+============+ - | HIGHEST | 0 | - +--------------+------------+ - | VERY_HIGH | 10 | - +--------------+------------+ - | HIGH | 30 | - +--------------+------------+ - | ABOVE_NORMAL | 40 | - +--------------+------------+ - | NORMAL | 50 | - +--------------+------------+ - | BELOW_NORMAL | 60 | - +--------------+------------+ - | LOW | 70 | - +--------------+------------+ - | VERY_LOW | 90 | - +--------------+------------+ - | LOWEST | 100 | - +--------------+------------+ - """ - - HIGHEST = 0 - VERY_HIGH = 10 - HIGH = 30 - ABOVE_NORMAL = 40 - NORMAL = 50 - BELOW_NORMAL = 60 - LOW = 70 - VERY_LOW = 90 - LOWEST = 100 - - -def get_priority(priority): - """Get priority value. - - Args: - priority (int or str or :obj:`Priority`): Priority. - - Returns: - int: The priority value. - """ - if isinstance(priority, int): - if priority < 0 or priority > 100: - raise ValueError('priority must be between 0 and 100') - return priority - elif isinstance(priority, Priority): - return priority.value - elif isinstance(priority, str): - return Priority[priority.upper()].value - else: - raise TypeError('priority must be an integer or Priority enum value') diff --git a/ControlNet/annotator/uniformer/mmcv/runner/utils.py b/ControlNet/annotator/uniformer/mmcv/runner/utils.py deleted file mode 100644 index c5befb8e56ece50b5fecfd007b26f8a29124c0bd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/runner/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import random -import sys -import time -import warnings -from getpass import getuser -from socket import gethostname - -import numpy as np -import torch - -import annotator.uniformer.mmcv as mmcv - - -def get_host_info(): - """Get hostname and username. - - Return empty string if exception raised, e.g. ``getpass.getuser()`` will - lead to error in docker container - """ - host = '' - try: - host = f'{getuser()}@{gethostname()}' - except Exception as e: - warnings.warn(f'Host or user not found: {str(e)}') - finally: - return host - - -def get_time_str(): - return time.strftime('%Y%m%d_%H%M%S', time.localtime()) - - -def obj_from_dict(info, parent=None, default_args=None): - """Initialize an object from dict. - - The dict must contain the key "type", which indicates the object type, it - can be either a string or type, such as "list" or ``list``. Remaining - fields are treated as the arguments for constructing the object. - - Args: - info (dict): Object types and arguments. - parent (:class:`module`): Module which may containing expected object - classes. - default_args (dict, optional): Default arguments for initializing the - object. - - Returns: - any type: Object built from the dict. - """ - assert isinstance(info, dict) and 'type' in info - assert isinstance(default_args, dict) or default_args is None - args = info.copy() - obj_type = args.pop('type') - if mmcv.is_str(obj_type): - if parent is not None: - obj_type = getattr(parent, obj_type) - else: - obj_type = sys.modules[obj_type] - elif not isinstance(obj_type, type): - raise TypeError('type must be a str or valid type, but ' - f'got {type(obj_type)}') - if default_args is not None: - for name, value in default_args.items(): - args.setdefault(name, value) - return obj_type(**args) - - -def set_random_seed(seed, deterministic=False, use_rank_shift=False): - """Set random seed. - - Args: - seed (int): Seed to be used. 
- deterministic (bool): Whether to set the deterministic option for - CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` - to True and `torch.backends.cudnn.benchmark` to False. - Default: False. - rank_shift (bool): Whether to add rank number to the random seed to - have different random seed in different threads. Default: False. - """ - if use_rank_shift: - rank, _ = mmcv.runner.get_dist_info() - seed += rank - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - if deterministic: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False diff --git a/ControlNet/annotator/uniformer/mmcv/utils/__init__.py b/ControlNet/annotator/uniformer/mmcv/utils/__init__.py deleted file mode 100644 index 378a0068432a371af364de9d73785901c0f83383..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/__init__.py +++ /dev/null @@ -1,69 +0,0 @@ -# flake8: noqa -# Copyright (c) OpenMMLab. All rights reserved. -from .config import Config, ConfigDict, DictAction -from .misc import (check_prerequisites, concat_list, deprecated_api_warning, - has_method, import_modules_from_strings, is_list_of, - is_method_overridden, is_seq_of, is_str, is_tuple_of, - iter_cast, list_cast, requires_executable, requires_package, - slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, - to_ntuple, tuple_cast) -from .path import (check_file_exist, fopen, is_filepath, mkdir_or_exist, - scandir, symlink) -from .progressbar import (ProgressBar, track_iter_progress, - track_parallel_progress, track_progress) -from .testing import (assert_attrs_equal, assert_dict_contains_subset, - assert_dict_has_keys, assert_is_norm_layer, - assert_keys_equal, assert_params_all_zeros, - check_python_script) -from .timer import Timer, TimerError, check_time -from .version_utils import digit_version, get_git_hash - -try: - import torch -except ImportError: - __all__ = [ - 'Config', 'ConfigDict', 'DictAction', 'is_str', 'iter_cast', - 'list_cast', 'tuple_cast', 'is_seq_of', 'is_list_of', 'is_tuple_of', - 'slice_list', 'concat_list', 'check_prerequisites', 'requires_package', - 'requires_executable', 'is_filepath', 'fopen', 'check_file_exist', - 'mkdir_or_exist', 'symlink', 'scandir', 'ProgressBar', - 'track_progress', 'track_iter_progress', 'track_parallel_progress', - 'Timer', 'TimerError', 'check_time', 'deprecated_api_warning', - 'digit_version', 'get_git_hash', 'import_modules_from_strings', - 'assert_dict_contains_subset', 'assert_attrs_equal', - 'assert_dict_has_keys', 'assert_keys_equal', 'check_python_script', - 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'to_ntuple', - 'is_method_overridden', 'has_method' - ] -else: - from .env import collect_env - from .logging import get_logger, print_log - from .parrots_jit import jit, skip_no_elena - from .parrots_wrapper import ( - TORCH_VERSION, BuildExtension, CppExtension, CUDAExtension, DataLoader, - PoolDataLoader, SyncBatchNorm, _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, - _AvgPoolNd, _BatchNorm, _ConvNd, _ConvTransposeMixin, _InstanceNorm, - _MaxPoolNd, get_build_config, is_rocm_pytorch, _get_cuda_home) - from .registry import Registry, build_from_cfg - from .trace import is_jit_tracing - __all__ = [ - 'Config', 'ConfigDict', 'DictAction', 'collect_env', 'get_logger', - 'print_log', 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', - 'is_seq_of', 'is_list_of', 'is_tuple_of', 'slice_list', 
'concat_list', - 'check_prerequisites', 'requires_package', 'requires_executable', - 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', - 'symlink', 'scandir', 'ProgressBar', 'track_progress', - 'track_iter_progress', 'track_parallel_progress', 'Registry', - 'build_from_cfg', 'Timer', 'TimerError', 'check_time', 'SyncBatchNorm', - '_AdaptiveAvgPoolNd', '_AdaptiveMaxPoolNd', '_AvgPoolNd', '_BatchNorm', - '_ConvNd', '_ConvTransposeMixin', '_InstanceNorm', '_MaxPoolNd', - 'get_build_config', 'BuildExtension', 'CppExtension', 'CUDAExtension', - 'DataLoader', 'PoolDataLoader', 'TORCH_VERSION', - 'deprecated_api_warning', 'digit_version', 'get_git_hash', - 'import_modules_from_strings', 'jit', 'skip_no_elena', - 'assert_dict_contains_subset', 'assert_attrs_equal', - 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', - 'assert_params_all_zeros', 'check_python_script', - 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', - '_get_cuda_home', 'has_method' - ] diff --git a/ControlNet/annotator/uniformer/mmcv/utils/config.py b/ControlNet/annotator/uniformer/mmcv/utils/config.py deleted file mode 100644 index 17149353aefac6d737c67bb2f35a3a6cd2147b0a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/config.py +++ /dev/null @@ -1,688 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import ast -import copy -import os -import os.path as osp -import platform -import shutil -import sys -import tempfile -import uuid -import warnings -from argparse import Action, ArgumentParser -from collections import abc -from importlib import import_module - -from addict import Dict -from yapf.yapflib.yapf_api import FormatCode - -from .misc import import_modules_from_strings -from .path import check_file_exist - -if platform.system() == 'Windows': - import regex as re -else: - import re - -BASE_KEY = '_base_' -DELETE_KEY = '_delete_' -DEPRECATION_KEY = '_deprecation_' -RESERVED_KEYS = ['filename', 'text', 'pretty_text'] - - -class ConfigDict(Dict): - - def __missing__(self, name): - raise KeyError(name) - - def __getattr__(self, name): - try: - value = super(ConfigDict, self).__getattr__(name) - except KeyError: - ex = AttributeError(f"'{self.__class__.__name__}' object has no " - f"attribute '{name}'") - except Exception as e: - ex = e - else: - return value - raise ex - - -def add_args(parser, cfg, prefix=''): - for k, v in cfg.items(): - if isinstance(v, str): - parser.add_argument('--' + prefix + k) - elif isinstance(v, int): - parser.add_argument('--' + prefix + k, type=int) - elif isinstance(v, float): - parser.add_argument('--' + prefix + k, type=float) - elif isinstance(v, bool): - parser.add_argument('--' + prefix + k, action='store_true') - elif isinstance(v, dict): - add_args(parser, v, prefix + k + '.') - elif isinstance(v, abc.Iterable): - parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') - else: - print(f'cannot parse key {prefix + k} of type {type(v)}') - return parser - - -class Config: - """A facility for config and config files. - - It supports common file formats as configs: python/json/yaml. The interface - is the same as a dict object and also allows access config values as - attributes. 
- - Example: - >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) - >>> cfg.a - 1 - >>> cfg.b - {'b1': [0, 1]} - >>> cfg.b.b1 - [0, 1] - >>> cfg = Config.fromfile('tests/data/config/a.py') - >>> cfg.filename - "/home/kchen/projects/mmcv/tests/data/config/a.py" - >>> cfg.item4 - 'test' - >>> cfg - "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " - "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" - """ - - @staticmethod - def _validate_py_syntax(filename): - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - content = f.read() - try: - ast.parse(content) - except SyntaxError as e: - raise SyntaxError('There are syntax errors in config ' - f'file {filename}: {e}') - - @staticmethod - def _substitute_predefined_vars(filename, temp_config_name): - file_dirname = osp.dirname(filename) - file_basename = osp.basename(filename) - file_basename_no_extension = osp.splitext(file_basename)[0] - file_extname = osp.splitext(filename)[1] - support_templates = dict( - fileDirname=file_dirname, - fileBasename=file_basename, - fileBasenameNoExtension=file_basename_no_extension, - fileExtname=file_extname) - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - config_file = f.read() - for key, value in support_templates.items(): - regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' - value = value.replace('\\', '/') - config_file = re.sub(regexp, value, config_file) - with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: - tmp_config_file.write(config_file) - - @staticmethod - def _pre_substitute_base_vars(filename, temp_config_name): - """Substitute base variable placehoders to string, so that parsing - would work.""" - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - config_file = f.read() - base_var_dict = {} - regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' - base_vars = set(re.findall(regexp, config_file)) - for base_var in base_vars: - randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' - base_var_dict[randstr] = base_var - regexp = r'\{\{\s*' + BASE_KEY + r'\.' 
+ base_var + r'\s*\}\}' - config_file = re.sub(regexp, f'"{randstr}"', config_file) - with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: - tmp_config_file.write(config_file) - return base_var_dict - - @staticmethod - def _substitute_base_vars(cfg, base_var_dict, base_cfg): - """Substitute variable strings to their actual values.""" - cfg = copy.deepcopy(cfg) - - if isinstance(cfg, dict): - for k, v in cfg.items(): - if isinstance(v, str) and v in base_var_dict: - new_v = base_cfg - for new_k in base_var_dict[v].split('.'): - new_v = new_v[new_k] - cfg[k] = new_v - elif isinstance(v, (list, tuple, dict)): - cfg[k] = Config._substitute_base_vars( - v, base_var_dict, base_cfg) - elif isinstance(cfg, tuple): - cfg = tuple( - Config._substitute_base_vars(c, base_var_dict, base_cfg) - for c in cfg) - elif isinstance(cfg, list): - cfg = [ - Config._substitute_base_vars(c, base_var_dict, base_cfg) - for c in cfg - ] - elif isinstance(cfg, str) and cfg in base_var_dict: - new_v = base_cfg - for new_k in base_var_dict[cfg].split('.'): - new_v = new_v[new_k] - cfg = new_v - - return cfg - - @staticmethod - def _file2dict(filename, use_predefined_variables=True): - filename = osp.abspath(osp.expanduser(filename)) - check_file_exist(filename) - fileExtname = osp.splitext(filename)[1] - if fileExtname not in ['.py', '.json', '.yaml', '.yml']: - raise IOError('Only py/yml/yaml/json type are supported now!') - - with tempfile.TemporaryDirectory() as temp_config_dir: - temp_config_file = tempfile.NamedTemporaryFile( - dir=temp_config_dir, suffix=fileExtname) - if platform.system() == 'Windows': - temp_config_file.close() - temp_config_name = osp.basename(temp_config_file.name) - # Substitute predefined variables - if use_predefined_variables: - Config._substitute_predefined_vars(filename, - temp_config_file.name) - else: - shutil.copyfile(filename, temp_config_file.name) - # Substitute base variables from placeholders to strings - base_var_dict = Config._pre_substitute_base_vars( - temp_config_file.name, temp_config_file.name) - - if filename.endswith('.py'): - temp_module_name = osp.splitext(temp_config_name)[0] - sys.path.insert(0, temp_config_dir) - Config._validate_py_syntax(filename) - mod = import_module(temp_module_name) - sys.path.pop(0) - cfg_dict = { - name: value - for name, value in mod.__dict__.items() - if not name.startswith('__') - } - # delete imported module - del sys.modules[temp_module_name] - elif filename.endswith(('.yml', '.yaml', '.json')): - import annotator.uniformer.mmcv as mmcv - cfg_dict = mmcv.load(temp_config_file.name) - # close temp file - temp_config_file.close() - - # check deprecation information - if DEPRECATION_KEY in cfg_dict: - deprecation_info = cfg_dict.pop(DEPRECATION_KEY) - warning_msg = f'The config file {filename} will be deprecated ' \ - 'in the future.' - if 'expected' in deprecation_info: - warning_msg += f' Please use {deprecation_info["expected"]} ' \ - 'instead.' 
- if 'reference' in deprecation_info: - warning_msg += ' More information can be found at ' \ - f'{deprecation_info["reference"]}' - warnings.warn(warning_msg) - - cfg_text = filename + '\n' - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - cfg_text += f.read() - - if BASE_KEY in cfg_dict: - cfg_dir = osp.dirname(filename) - base_filename = cfg_dict.pop(BASE_KEY) - base_filename = base_filename if isinstance( - base_filename, list) else [base_filename] - - cfg_dict_list = list() - cfg_text_list = list() - for f in base_filename: - _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) - cfg_dict_list.append(_cfg_dict) - cfg_text_list.append(_cfg_text) - - base_cfg_dict = dict() - for c in cfg_dict_list: - duplicate_keys = base_cfg_dict.keys() & c.keys() - if len(duplicate_keys) > 0: - raise KeyError('Duplicate key is not allowed among bases. ' - f'Duplicate keys: {duplicate_keys}') - base_cfg_dict.update(c) - - # Substitute base variables from strings to their actual values - cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, - base_cfg_dict) - - base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) - cfg_dict = base_cfg_dict - - # merge cfg_text - cfg_text_list.append(cfg_text) - cfg_text = '\n'.join(cfg_text_list) - - return cfg_dict, cfg_text - - @staticmethod - def _merge_a_into_b(a, b, allow_list_keys=False): - """merge dict ``a`` into dict ``b`` (non-inplace). - - Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid - in-place modifications. - - Args: - a (dict): The source dict to be merged into ``b``. - b (dict): The origin dict to be fetch keys from ``a``. - allow_list_keys (bool): If True, int string keys (e.g. '0', '1') - are allowed in source ``a`` and will replace the element of the - corresponding index in b if b is a list. Default: False. - - Returns: - dict: The modified dict of ``b`` using ``a``. - - Examples: - # Normally merge a into b. - >>> Config._merge_a_into_b( - ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) - {'obj': {'a': 2}} - - # Delete b first and merge a into b. - >>> Config._merge_a_into_b( - ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) - {'obj': {'a': 2}} - - # b is a list - >>> Config._merge_a_into_b( - ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) - [{'a': 2}, {'b': 2}] - """ - b = b.copy() - for k, v in a.items(): - if allow_list_keys and k.isdigit() and isinstance(b, list): - k = int(k) - if len(b) <= k: - raise KeyError(f'Index {k} exceeds the length of list {b}') - b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) - elif isinstance(v, - dict) and k in b and not v.pop(DELETE_KEY, False): - allowed_types = (dict, list) if allow_list_keys else dict - if not isinstance(b[k], allowed_types): - raise TypeError( - f'{k}={v} in child config cannot inherit from base ' - f'because {k} is a dict in the child config but is of ' - f'type {type(b[k])} in base config. 
You may set ' - f'`{DELETE_KEY}=True` to ignore the base config') - b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) - else: - b[k] = v - return b - - @staticmethod - def fromfile(filename, - use_predefined_variables=True, - import_custom_modules=True): - cfg_dict, cfg_text = Config._file2dict(filename, - use_predefined_variables) - if import_custom_modules and cfg_dict.get('custom_imports', None): - import_modules_from_strings(**cfg_dict['custom_imports']) - return Config(cfg_dict, cfg_text=cfg_text, filename=filename) - - @staticmethod - def fromstring(cfg_str, file_format): - """Generate config from config str. - - Args: - cfg_str (str): Config str. - file_format (str): Config file format corresponding to the - config str. Only py/yml/yaml/json type are supported now! - - Returns: - obj:`Config`: Config obj. - """ - if file_format not in ['.py', '.json', '.yaml', '.yml']: - raise IOError('Only py/yml/yaml/json type are supported now!') - if file_format != '.py' and 'dict(' in cfg_str: - # check if users specify a wrong suffix for python - warnings.warn( - 'Please check "file_format", the file format may be .py') - with tempfile.NamedTemporaryFile( - 'w', encoding='utf-8', suffix=file_format, - delete=False) as temp_file: - temp_file.write(cfg_str) - # on windows, previous implementation cause error - # see PR 1077 for details - cfg = Config.fromfile(temp_file.name) - os.remove(temp_file.name) - return cfg - - @staticmethod - def auto_argparser(description=None): - """Generate argparser from config file automatically (experimental)""" - partial_parser = ArgumentParser(description=description) - partial_parser.add_argument('config', help='config file path') - cfg_file = partial_parser.parse_known_args()[0].config - cfg = Config.fromfile(cfg_file) - parser = ArgumentParser(description=description) - parser.add_argument('config', help='config file path') - add_args(parser, cfg) - return parser, cfg - - def __init__(self, cfg_dict=None, cfg_text=None, filename=None): - if cfg_dict is None: - cfg_dict = dict() - elif not isinstance(cfg_dict, dict): - raise TypeError('cfg_dict must be a dict, but ' - f'got {type(cfg_dict)}') - for key in cfg_dict: - if key in RESERVED_KEYS: - raise KeyError(f'{key} is reserved for config file') - - super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) - super(Config, self).__setattr__('_filename', filename) - if cfg_text: - text = cfg_text - elif filename: - with open(filename, 'r') as f: - text = f.read() - else: - text = '' - super(Config, self).__setattr__('_text', text) - - @property - def filename(self): - return self._filename - - @property - def text(self): - return self._text - - @property - def pretty_text(self): - - indent = 4 - - def _indent(s_, num_spaces): - s = s_.split('\n') - if len(s) == 1: - return s_ - first = s.pop(0) - s = [(num_spaces * ' ') + line for line in s] - s = '\n'.join(s) - s = first + '\n' + s - return s - - def _format_basic_types(k, v, use_mapping=False): - if isinstance(v, str): - v_str = f"'{v}'" - else: - v_str = str(v) - - if use_mapping: - k_str = f"'{k}'" if isinstance(k, str) else str(k) - attr_str = f'{k_str}: {v_str}' - else: - attr_str = f'{str(k)}={v_str}' - attr_str = _indent(attr_str, indent) - - return attr_str - - def _format_list(k, v, use_mapping=False): - # check if all items in the list are dict - if all(isinstance(_, dict) for _ in v): - v_str = '[\n' - v_str += '\n'.join( - f'dict({_indent(_format_dict(v_), indent)}),' - for v_ in v).rstrip(',') - if use_mapping: - k_str = f"'{k}'" if 
isinstance(k, str) else str(k) - attr_str = f'{k_str}: {v_str}' - else: - attr_str = f'{str(k)}={v_str}' - attr_str = _indent(attr_str, indent) + ']' - else: - attr_str = _format_basic_types(k, v, use_mapping) - return attr_str - - def _contain_invalid_identifier(dict_str): - contain_invalid_identifier = False - for key_name in dict_str: - contain_invalid_identifier |= \ - (not str(key_name).isidentifier()) - return contain_invalid_identifier - - def _format_dict(input_dict, outest_level=False): - r = '' - s = [] - - use_mapping = _contain_invalid_identifier(input_dict) - if use_mapping: - r += '{' - for idx, (k, v) in enumerate(input_dict.items()): - is_last = idx >= len(input_dict) - 1 - end = '' if outest_level or is_last else ',' - if isinstance(v, dict): - v_str = '\n' + _format_dict(v) - if use_mapping: - k_str = f"'{k}'" if isinstance(k, str) else str(k) - attr_str = f'{k_str}: dict({v_str}' - else: - attr_str = f'{str(k)}=dict({v_str}' - attr_str = _indent(attr_str, indent) + ')' + end - elif isinstance(v, list): - attr_str = _format_list(k, v, use_mapping) + end - else: - attr_str = _format_basic_types(k, v, use_mapping) + end - - s.append(attr_str) - r += '\n'.join(s) - if use_mapping: - r += '}' - return r - - cfg_dict = self._cfg_dict.to_dict() - text = _format_dict(cfg_dict, outest_level=True) - # copied from setup.cfg - yapf_style = dict( - based_on_style='pep8', - blank_line_before_nested_class_or_def=True, - split_before_expression_after_opening_paren=True) - text, _ = FormatCode(text, style_config=yapf_style, verify=True) - - return text - - def __repr__(self): - return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' - - def __len__(self): - return len(self._cfg_dict) - - def __getattr__(self, name): - return getattr(self._cfg_dict, name) - - def __getitem__(self, name): - return self._cfg_dict.__getitem__(name) - - def __setattr__(self, name, value): - if isinstance(value, dict): - value = ConfigDict(value) - self._cfg_dict.__setattr__(name, value) - - def __setitem__(self, name, value): - if isinstance(value, dict): - value = ConfigDict(value) - self._cfg_dict.__setitem__(name, value) - - def __iter__(self): - return iter(self._cfg_dict) - - def __getstate__(self): - return (self._cfg_dict, self._filename, self._text) - - def __setstate__(self, state): - _cfg_dict, _filename, _text = state - super(Config, self).__setattr__('_cfg_dict', _cfg_dict) - super(Config, self).__setattr__('_filename', _filename) - super(Config, self).__setattr__('_text', _text) - - def dump(self, file=None): - cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() - if self.filename.endswith('.py'): - if file is None: - return self.pretty_text - else: - with open(file, 'w', encoding='utf-8') as f: - f.write(self.pretty_text) - else: - import annotator.uniformer.mmcv as mmcv - if file is None: - file_format = self.filename.split('.')[-1] - return mmcv.dump(cfg_dict, file_format=file_format) - else: - mmcv.dump(cfg_dict, file) - - def merge_from_dict(self, options, allow_list_keys=True): - """Merge list into cfg_dict. - - Merge the dict parsed by MultipleKVAction into this cfg. - - Examples: - >>> options = {'model.backbone.depth': 50, - ... 'model.backbone.with_cp':True} - >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) - >>> cfg.merge_from_dict(options) - >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - >>> assert cfg_dict == dict( - ... 
model=dict(backbone=dict(depth=50, with_cp=True))) - - # Merge list element - >>> cfg = Config(dict(pipeline=[ - ... dict(type='LoadImage'), dict(type='LoadAnnotations')])) - >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) - >>> cfg.merge_from_dict(options, allow_list_keys=True) - >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - >>> assert cfg_dict == dict(pipeline=[ - ... dict(type='SelfLoadImage'), dict(type='LoadAnnotations')]) - - Args: - options (dict): dict of configs to merge from. - allow_list_keys (bool): If True, int string keys (e.g. '0', '1') - are allowed in ``options`` and will replace the element of the - corresponding index in the config if the config is a list. - Default: True. - """ - option_cfg_dict = {} - for full_key, v in options.items(): - d = option_cfg_dict - key_list = full_key.split('.') - for subkey in key_list[:-1]: - d.setdefault(subkey, ConfigDict()) - d = d[subkey] - subkey = key_list[-1] - d[subkey] = v - - cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - super(Config, self).__setattr__( - '_cfg_dict', - Config._merge_a_into_b( - option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) - - -class DictAction(Action): - """ - argparse action to split an argument into KEY=VALUE form - on the first = and append to a dictionary. List options can - be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit - brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build - list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' - """ - - @staticmethod - def _parse_int_float_bool(val): - try: - return int(val) - except ValueError: - pass - try: - return float(val) - except ValueError: - pass - if val.lower() in ['true', 'false']: - return True if val.lower() == 'true' else False - return val - - @staticmethod - def _parse_iterable(val): - """Parse iterable values in the string. - - All elements inside '()' or '[]' are treated as iterable values. - - Args: - val (str): Value string. - - Returns: - list | tuple: The expanded list or tuple from the string. - - Examples: - >>> DictAction._parse_iterable('1,2,3') - [1, 2, 3] - >>> DictAction._parse_iterable('[a, b, c]') - ['a', 'b', 'c'] - >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') - [(1, 2, 3), ['a', 'b'], 'c'] - """ - - def find_next_comma(string): - """Find the position of next comma in the string. - - If no ',' is found in the string, return the string length. All - chars inside '()' and '[]' are treated as one element and thus ',' - inside these brackets are ignored. - """ - assert (string.count('(') == string.count(')')) and ( - string.count('[') == string.count(']')), \ - f'Imbalanced brackets exist in {string}' - end = len(string) - for idx, char in enumerate(string): - pre = string[:idx] - # The string before this ',' is balanced - if ((char == ',') and (pre.count('(') == pre.count(')')) - and (pre.count('[') == pre.count(']'))): - end = idx - break - return end - - # Strip ' and " characters and replace whitespace. 
- val = val.strip('\'\"').replace(' ', '') - is_tuple = False - if val.startswith('(') and val.endswith(')'): - is_tuple = True - val = val[1:-1] - elif val.startswith('[') and val.endswith(']'): - val = val[1:-1] - elif ',' not in val: - # val is a single value - return DictAction._parse_int_float_bool(val) - - values = [] - while len(val) > 0: - comma_idx = find_next_comma(val) - element = DictAction._parse_iterable(val[:comma_idx]) - values.append(element) - val = val[comma_idx + 1:] - if is_tuple: - values = tuple(values) - return values - - def __call__(self, parser, namespace, values, option_string=None): - options = {} - for kv in values: - key, val = kv.split('=', maxsplit=1) - options[key] = self._parse_iterable(val) - setattr(namespace, self.dest, options) diff --git a/ControlNet/annotator/uniformer/mmcv/utils/env.py b/ControlNet/annotator/uniformer/mmcv/utils/env.py deleted file mode 100644 index e3f0d92529e193e6d8339419bcd9bed7901a7769..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/env.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -"""This file holding some environment constant for sharing by other files.""" - -import os.path as osp -import subprocess -import sys -from collections import defaultdict - -import cv2 -import torch - -import annotator.uniformer.mmcv as mmcv -from .parrots_wrapper import get_build_config - - -def collect_env(): - """Collect the information of the running environments. - - Returns: - dict: The environment information. The following fields are contained. - - - sys.platform: The variable of ``sys.platform``. - - Python: Python version. - - CUDA available: Bool, indicating if CUDA is available. - - GPU devices: Device type of each GPU. - - CUDA_HOME (optional): The env var ``CUDA_HOME``. - - NVCC (optional): NVCC version. - - GCC: GCC version, "n/a" if GCC is not installed. - - PyTorch: PyTorch version. - - PyTorch compiling details: The output of \ - ``torch.__config__.show()``. - - TorchVision (optional): TorchVision version. - - OpenCV: OpenCV version. - - MMCV: MMCV version. - - MMCV Compiler: The GCC version for compiling MMCV ops. - - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops. 
- """ - env_info = {} - env_info['sys.platform'] = sys.platform - env_info['Python'] = sys.version.replace('\n', '') - - cuda_available = torch.cuda.is_available() - env_info['CUDA available'] = cuda_available - - if cuda_available: - devices = defaultdict(list) - for k in range(torch.cuda.device_count()): - devices[torch.cuda.get_device_name(k)].append(str(k)) - for name, device_ids in devices.items(): - env_info['GPU ' + ','.join(device_ids)] = name - - from annotator.uniformer.mmcv.utils.parrots_wrapper import _get_cuda_home - CUDA_HOME = _get_cuda_home() - env_info['CUDA_HOME'] = CUDA_HOME - - if CUDA_HOME is not None and osp.isdir(CUDA_HOME): - try: - nvcc = osp.join(CUDA_HOME, 'bin/nvcc') - nvcc = subprocess.check_output( - f'"{nvcc}" -V | tail -n1', shell=True) - nvcc = nvcc.decode('utf-8').strip() - except subprocess.SubprocessError: - nvcc = 'Not Available' - env_info['NVCC'] = nvcc - - try: - gcc = subprocess.check_output('gcc --version | head -n1', shell=True) - gcc = gcc.decode('utf-8').strip() - env_info['GCC'] = gcc - except subprocess.CalledProcessError: # gcc is unavailable - env_info['GCC'] = 'n/a' - - env_info['PyTorch'] = torch.__version__ - env_info['PyTorch compiling details'] = get_build_config() - - try: - import torchvision - env_info['TorchVision'] = torchvision.__version__ - except ModuleNotFoundError: - pass - - env_info['OpenCV'] = cv2.__version__ - - env_info['MMCV'] = mmcv.__version__ - - try: - from annotator.uniformer.mmcv.ops import get_compiler_version, get_compiling_cuda_version - except ModuleNotFoundError: - env_info['MMCV Compiler'] = 'n/a' - env_info['MMCV CUDA Compiler'] = 'n/a' - else: - env_info['MMCV Compiler'] = get_compiler_version() - env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version() - - return env_info diff --git a/ControlNet/annotator/uniformer/mmcv/utils/ext_loader.py b/ControlNet/annotator/uniformer/mmcv/utils/ext_loader.py deleted file mode 100644 index 08132d2c1b9a1c28880e4bab4d4fa1ba39d9d083..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/ext_loader.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import importlib -import os -import pkgutil -import warnings -from collections import namedtuple - -import torch - -if torch.__version__ != 'parrots': - - def load_ext(name, funcs): - ext = importlib.import_module('mmcv.' 
+ name) - for fun in funcs: - assert hasattr(ext, fun), f'{fun} miss in module {name}' - return ext -else: - from parrots import extension - from parrots.base import ParrotsException - - has_return_value_ops = [ - 'nms', - 'softnms', - 'nms_match', - 'nms_rotated', - 'top_pool_forward', - 'top_pool_backward', - 'bottom_pool_forward', - 'bottom_pool_backward', - 'left_pool_forward', - 'left_pool_backward', - 'right_pool_forward', - 'right_pool_backward', - 'fused_bias_leakyrelu', - 'upfirdn2d', - 'ms_deform_attn_forward', - 'pixel_group', - 'contour_expand', - ] - - def get_fake_func(name, e): - - def fake_func(*args, **kwargs): - warnings.warn(f'{name} is not supported in parrots now') - raise e - - return fake_func - - def load_ext(name, funcs): - ExtModule = namedtuple('ExtModule', funcs) - ext_list = [] - lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - for fun in funcs: - try: - ext_fun = extension.load(fun, name, lib_dir=lib_root) - except ParrotsException as e: - if 'No element registered' not in e.message: - warnings.warn(e.message) - ext_fun = get_fake_func(fun, e) - ext_list.append(ext_fun) - else: - if fun in has_return_value_ops: - ext_list.append(ext_fun.op) - else: - ext_list.append(ext_fun.op_) - return ExtModule(*ext_list) - - -def check_ops_exist(): - ext_loader = pkgutil.find_loader('mmcv._ext') - return ext_loader is not None diff --git a/ControlNet/annotator/uniformer/mmcv/utils/logging.py b/ControlNet/annotator/uniformer/mmcv/utils/logging.py deleted file mode 100644 index 4aa0e04bb9b3ab2a4bfbc4def50404ccbac2c6e6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/logging.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -import torch.distributed as dist - -logger_initialized = {} - - -def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): - """Initialize and get a logger by name. - - If the logger has not been initialized, this method will initialize the - logger by adding one or two handlers, otherwise the initialized logger will - be directly returned. During initialization, a StreamHandler will always be - added. If `log_file` is specified and the process rank is 0, a FileHandler - will also be added. - - Args: - name (str): Logger name. - log_file (str | None): The log filename. If specified, a FileHandler - will be added to the logger. - log_level (int): The logger level. Note that only the process of - rank 0 is affected, and other processes will set the level to - "Error" thus be silent most of the time. - file_mode (str): The file mode used in opening log file. - Defaults to 'w'. - - Returns: - logging.Logger: The expected logger. - """ - logger = logging.getLogger(name) - if name in logger_initialized: - return logger - # handle hierarchical names - # e.g., logger "a" is initialized, then logger "a.b" will skip the - # initialization since it is a child of "a". - for logger_name in logger_initialized: - if name.startswith(logger_name): - return logger - - # handle duplicate logs to the console - # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) - # to the root logger. As logger.propagate is True by default, this root - # level handler causes logging messages from rank>0 processes to - # unexpectedly show up on the console, creating much unwanted clutter. - # To fix this issue, we set the root logger's StreamHandler, if any, to log - # at the ERROR level. 
- for handler in logger.root.handlers: - if type(handler) is logging.StreamHandler: - handler.setLevel(logging.ERROR) - - stream_handler = logging.StreamHandler() - handlers = [stream_handler] - - if dist.is_available() and dist.is_initialized(): - rank = dist.get_rank() - else: - rank = 0 - - # only rank 0 will add a FileHandler - if rank == 0 and log_file is not None: - # Here, the default behaviour of the official logger is 'a'. Thus, we - # provide an interface to change the file mode to the default - # behaviour. - file_handler = logging.FileHandler(log_file, file_mode) - handlers.append(file_handler) - - formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') - for handler in handlers: - handler.setFormatter(formatter) - handler.setLevel(log_level) - logger.addHandler(handler) - - if rank == 0: - logger.setLevel(log_level) - else: - logger.setLevel(logging.ERROR) - - logger_initialized[name] = True - - return logger - - -def print_log(msg, logger=None, level=logging.INFO): - """Print a log message. - - Args: - msg (str): The message to be logged. - logger (logging.Logger | str | None): The logger to be used. - Some special loggers are: - - "silent": no message will be printed. - - other str: the logger obtained with `get_root_logger(logger)`. - - None: The `print()` method will be used to print log messages. - level (int): Logging level. Only available when `logger` is a Logger - object or "root". - """ - if logger is None: - print(msg) - elif isinstance(logger, logging.Logger): - logger.log(level, msg) - elif logger == 'silent': - pass - elif isinstance(logger, str): - _logger = get_logger(logger) - _logger.log(level, msg) - else: - raise TypeError( - 'logger should be either a logging.Logger object, str, ' - f'"silent" or None, but got {type(logger)}') diff --git a/ControlNet/annotator/uniformer/mmcv/utils/misc.py b/ControlNet/annotator/uniformer/mmcv/utils/misc.py deleted file mode 100644 index 2c58d0d7fee9fe3d4519270ad8c1e998d0d8a18c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/misc.py +++ /dev/null @@ -1,377 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import collections.abc -import functools -import itertools -import subprocess -import warnings -from collections import abc -from importlib import import_module -from inspect import getfullargspec -from itertools import repeat - - -# From PyTorch internals -def _ntuple(n): - - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple - - -def is_str(x): - """Whether the input is an string instance. - - Note: This method is deprecated since python 2 is no longer supported. - """ - return isinstance(x, str) - - -def import_modules_from_strings(imports, allow_failed_imports=False): - """Import modules from the given list of strings. - - Args: - imports (list | str | None): The given module names to be imported. - allow_failed_imports (bool): If True, the failed imports will return - None. Otherwise, an ImportError is raise. Default: False. - - Returns: - list[module] | module | None: The imported modules. - - Examples: - >>> osp, sys = import_modules_from_strings( - ... 
['os.path', 'sys']) - >>> import os.path as osp_ - >>> import sys as sys_ - >>> assert osp == osp_ - >>> assert sys == sys_ - """ - if not imports: - return - single_import = False - if isinstance(imports, str): - single_import = True - imports = [imports] - if not isinstance(imports, list): - raise TypeError( - f'custom_imports must be a list but got type {type(imports)}') - imported = [] - for imp in imports: - if not isinstance(imp, str): - raise TypeError( - f'{imp} is of type {type(imp)} and cannot be imported.') - try: - imported_tmp = import_module(imp) - except ImportError: - if allow_failed_imports: - warnings.warn(f'{imp} failed to import and is ignored.', - UserWarning) - imported_tmp = None - else: - raise ImportError - imported.append(imported_tmp) - if single_import: - imported = imported[0] - return imported - - -def iter_cast(inputs, dst_type, return_type=None): - """Cast elements of an iterable object into some type. - - Args: - inputs (Iterable): The input object. - dst_type (type): Destination type. - return_type (type, optional): If specified, the output object will be - converted to this type, otherwise an iterator. - - Returns: - iterator or specified type: The converted object. - """ - if not isinstance(inputs, abc.Iterable): - raise TypeError('inputs must be an iterable object') - if not isinstance(dst_type, type): - raise TypeError('"dst_type" must be a valid type') - - out_iterable = map(dst_type, inputs) - - if return_type is None: - return out_iterable - else: - return return_type(out_iterable) - - -def list_cast(inputs, dst_type): - """Cast elements of an iterable object into a list of some type. - - A partial method of :func:`iter_cast`. - """ - return iter_cast(inputs, dst_type, return_type=list) - - -def tuple_cast(inputs, dst_type): - """Cast elements of an iterable object into a tuple of some type. - - A partial method of :func:`iter_cast`. - """ - return iter_cast(inputs, dst_type, return_type=tuple) - - -def is_seq_of(seq, expected_type, seq_type=None): - """Check whether it is a sequence of some type. - - Args: - seq (Sequence): The sequence to be checked. - expected_type (type): Expected type of sequence items. - seq_type (type, optional): Expected sequence type. - - Returns: - bool: Whether the sequence is valid. - """ - if seq_type is None: - exp_seq_type = abc.Sequence - else: - assert isinstance(seq_type, type) - exp_seq_type = seq_type - if not isinstance(seq, exp_seq_type): - return False - for item in seq: - if not isinstance(item, expected_type): - return False - return True - - -def is_list_of(seq, expected_type): - """Check whether it is a list of some type. - - A partial method of :func:`is_seq_of`. - """ - return is_seq_of(seq, expected_type, seq_type=list) - - -def is_tuple_of(seq, expected_type): - """Check whether it is a tuple of some type. - - A partial method of :func:`is_seq_of`. - """ - return is_seq_of(seq, expected_type, seq_type=tuple) - - -def slice_list(in_list, lens): - """Slice a list into several sub lists by a list of given length. - - Args: - in_list (list): The list to be sliced. - lens(int or list): The expected length of each out list. - - Returns: - list: A list of sliced list. 
- """ - if isinstance(lens, int): - assert len(in_list) % lens == 0 - lens = [lens] * int(len(in_list) / lens) - if not isinstance(lens, list): - raise TypeError('"indices" must be an integer or a list of integers') - elif sum(lens) != len(in_list): - raise ValueError('sum of lens and list length does not ' - f'match: {sum(lens)} != {len(in_list)}') - out_list = [] - idx = 0 - for i in range(len(lens)): - out_list.append(in_list[idx:idx + lens[i]]) - idx += lens[i] - return out_list - - -def concat_list(in_list): - """Concatenate a list of list into a single list. - - Args: - in_list (list): The list of list to be merged. - - Returns: - list: The concatenated flat list. - """ - return list(itertools.chain(*in_list)) - - -def check_prerequisites( - prerequisites, - checker, - msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' - 'found, please install them first.'): # yapf: disable - """A decorator factory to check if prerequisites are satisfied. - - Args: - prerequisites (str of list[str]): Prerequisites to be checked. - checker (callable): The checker method that returns True if a - prerequisite is meet, False otherwise. - msg_tmpl (str): The message template with two variables. - - Returns: - decorator: A specific decorator. - """ - - def wrap(func): - - @functools.wraps(func) - def wrapped_func(*args, **kwargs): - requirements = [prerequisites] if isinstance( - prerequisites, str) else prerequisites - missing = [] - for item in requirements: - if not checker(item): - missing.append(item) - if missing: - print(msg_tmpl.format(', '.join(missing), func.__name__)) - raise RuntimeError('Prerequisites not meet.') - else: - return func(*args, **kwargs) - - return wrapped_func - - return wrap - - -def _check_py_package(package): - try: - import_module(package) - except ImportError: - return False - else: - return True - - -def _check_executable(cmd): - if subprocess.call(f'which {cmd}', shell=True) != 0: - return False - else: - return True - - -def requires_package(prerequisites): - """A decorator to check if some python packages are installed. - - Example: - >>> @requires_package('numpy') - >>> func(arg1, args): - >>> return numpy.zeros(1) - array([0.]) - >>> @requires_package(['numpy', 'non_package']) - >>> func(arg1, args): - >>> return numpy.zeros(1) - ImportError - """ - return check_prerequisites(prerequisites, checker=_check_py_package) - - -def requires_executable(prerequisites): - """A decorator to check if some executable files are installed. - - Example: - >>> @requires_executable('ffmpeg') - >>> func(arg1, args): - >>> print(1) - 1 - """ - return check_prerequisites(prerequisites, checker=_check_executable) - - -def deprecated_api_warning(name_dict, cls_name=None): - """A decorator to check if some arguments are deprecate and try to replace - deprecate src_arg_name to dst_arg_name. - - Args: - name_dict(dict): - key (str): Deprecate argument names. - val (str): Expected argument names. - - Returns: - func: New function. 
- """ - - def api_warning_wrapper(old_func): - - @functools.wraps(old_func) - def new_func(*args, **kwargs): - # get the arg spec of the decorated method - args_info = getfullargspec(old_func) - # get name of the function - func_name = old_func.__name__ - if cls_name is not None: - func_name = f'{cls_name}.{func_name}' - if args: - arg_names = args_info.args[:len(args)] - for src_arg_name, dst_arg_name in name_dict.items(): - if src_arg_name in arg_names: - warnings.warn( - f'"{src_arg_name}" is deprecated in ' - f'`{func_name}`, please use "{dst_arg_name}" ' - 'instead') - arg_names[arg_names.index(src_arg_name)] = dst_arg_name - if kwargs: - for src_arg_name, dst_arg_name in name_dict.items(): - if src_arg_name in kwargs: - - assert dst_arg_name not in kwargs, ( - f'The expected behavior is to replace ' - f'the deprecated key `{src_arg_name}` to ' - f'new key `{dst_arg_name}`, but got them ' - f'in the arguments at the same time, which ' - f'is confusing. `{src_arg_name} will be ' - f'deprecated in the future, please ' - f'use `{dst_arg_name}` instead.') - - warnings.warn( - f'"{src_arg_name}" is deprecated in ' - f'`{func_name}`, please use "{dst_arg_name}" ' - 'instead') - kwargs[dst_arg_name] = kwargs.pop(src_arg_name) - - # apply converted arguments to the decorated method - output = old_func(*args, **kwargs) - return output - - return new_func - - return api_warning_wrapper - - -def is_method_overridden(method, base_class, derived_class): - """Check if a method of base class is overridden in derived class. - - Args: - method (str): the method name to check. - base_class (type): the class of the base class. - derived_class (type | Any): the class or instance of the derived class. - """ - assert isinstance(base_class, type), \ - "base_class doesn't accept instance, Please pass class instead." - - if not isinstance(derived_class, type): - derived_class = derived_class.__class__ - - base_method = getattr(base_class, method) - derived_method = getattr(derived_class, method) - return derived_method != base_method - - -def has_method(obj: object, method: str) -> bool: - """Check whether the object has a method. - - Args: - method (str): The method name to check. - obj (object): The object to check. - - Returns: - bool: True if the object has the method else False. - """ - return hasattr(obj, method) and callable(getattr(obj, method)) diff --git a/ControlNet/annotator/uniformer/mmcv/utils/parrots_jit.py b/ControlNet/annotator/uniformer/mmcv/utils/parrots_jit.py deleted file mode 100644 index 61873f6dbb9b10ed972c90aa8faa321e3cb3249e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/parrots_jit.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os - -from .parrots_wrapper import TORCH_VERSION - -parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') - -if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': - from parrots.jit import pat as jit -else: - - def jit(func=None, - check_input=None, - full_shape=True, - derivate=False, - coderize=False, - optimize=False): - - def wrapper(func): - - def wrapper_inner(*args, **kargs): - return func(*args, **kargs) - - return wrapper_inner - - if func is None: - return wrapper - else: - return func - - -if TORCH_VERSION == 'parrots': - from parrots.utils.tester import skip_no_elena -else: - - def skip_no_elena(func): - - def wrapper(*args, **kargs): - return func(*args, **kargs) - - return wrapper diff --git a/ControlNet/annotator/uniformer/mmcv/utils/parrots_wrapper.py b/ControlNet/annotator/uniformer/mmcv/utils/parrots_wrapper.py deleted file mode 100644 index 93c97640d4b9ed088ca82cfe03e6efebfcfa9dbf..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/parrots_wrapper.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from functools import partial - -import torch - -TORCH_VERSION = torch.__version__ - - -def is_rocm_pytorch() -> bool: - is_rocm = False - if TORCH_VERSION != 'parrots': - try: - from torch.utils.cpp_extension import ROCM_HOME - is_rocm = True if ((torch.version.hip is not None) and - (ROCM_HOME is not None)) else False - except ImportError: - pass - return is_rocm - - -def _get_cuda_home(): - if TORCH_VERSION == 'parrots': - from parrots.utils.build_extension import CUDA_HOME - else: - if is_rocm_pytorch(): - from torch.utils.cpp_extension import ROCM_HOME - CUDA_HOME = ROCM_HOME - else: - from torch.utils.cpp_extension import CUDA_HOME - return CUDA_HOME - - -def get_build_config(): - if TORCH_VERSION == 'parrots': - from parrots.config import get_build_info - return get_build_info() - else: - return torch.__config__.show() - - -def _get_conv(): - if TORCH_VERSION == 'parrots': - from parrots.nn.modules.conv import _ConvNd, _ConvTransposeMixin - else: - from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin - return _ConvNd, _ConvTransposeMixin - - -def _get_dataloader(): - if TORCH_VERSION == 'parrots': - from torch.utils.data import DataLoader, PoolDataLoader - else: - from torch.utils.data import DataLoader - PoolDataLoader = DataLoader - return DataLoader, PoolDataLoader - - -def _get_extension(): - if TORCH_VERSION == 'parrots': - from parrots.utils.build_extension import BuildExtension, Extension - CppExtension = partial(Extension, cuda=False) - CUDAExtension = partial(Extension, cuda=True) - else: - from torch.utils.cpp_extension import (BuildExtension, CppExtension, - CUDAExtension) - return BuildExtension, CppExtension, CUDAExtension - - -def _get_pool(): - if TORCH_VERSION == 'parrots': - from parrots.nn.modules.pool import (_AdaptiveAvgPoolNd, - _AdaptiveMaxPoolNd, _AvgPoolNd, - _MaxPoolNd) - else: - from torch.nn.modules.pooling import (_AdaptiveAvgPoolNd, - _AdaptiveMaxPoolNd, _AvgPoolNd, - _MaxPoolNd) - return _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd - - -def _get_norm(): - if TORCH_VERSION == 'parrots': - from parrots.nn.modules.batchnorm import _BatchNorm, _InstanceNorm - SyncBatchNorm_ = torch.nn.SyncBatchNorm2d - else: - from torch.nn.modules.instancenorm import _InstanceNorm - from torch.nn.modules.batchnorm import _BatchNorm - SyncBatchNorm_ = torch.nn.SyncBatchNorm - return _BatchNorm, _InstanceNorm, SyncBatchNorm_ - - -_ConvNd, 
_ConvTransposeMixin = _get_conv() -DataLoader, PoolDataLoader = _get_dataloader() -BuildExtension, CppExtension, CUDAExtension = _get_extension() -_BatchNorm, _InstanceNorm, SyncBatchNorm_ = _get_norm() -_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd = _get_pool() - - -class SyncBatchNorm(SyncBatchNorm_): - - def _check_input_dim(self, input): - if TORCH_VERSION == 'parrots': - if input.dim() < 2: - raise ValueError( - f'expected at least 2D input (got {input.dim()}D input)') - else: - super()._check_input_dim(input) diff --git a/ControlNet/annotator/uniformer/mmcv/utils/path.py b/ControlNet/annotator/uniformer/mmcv/utils/path.py deleted file mode 100644 index 7dab4b3041413b1432b0f434b8b14783097d33c6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/path.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import os.path as osp -from pathlib import Path - -from .misc import is_str - - -def is_filepath(x): - return is_str(x) or isinstance(x, Path) - - -def fopen(filepath, *args, **kwargs): - if is_str(filepath): - return open(filepath, *args, **kwargs) - elif isinstance(filepath, Path): - return filepath.open(*args, **kwargs) - raise ValueError('`filepath` should be a string or a Path') - - -def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): - if not osp.isfile(filename): - raise FileNotFoundError(msg_tmpl.format(filename)) - - -def mkdir_or_exist(dir_name, mode=0o777): - if dir_name == '': - return - dir_name = osp.expanduser(dir_name) - os.makedirs(dir_name, mode=mode, exist_ok=True) - - -def symlink(src, dst, overwrite=True, **kwargs): - if os.path.lexists(dst) and overwrite: - os.remove(dst) - os.symlink(src, dst, **kwargs) - - -def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True): - """Scan a directory to find the interested files. - - Args: - dir_path (str | obj:`Path`): Path of the directory. - suffix (str | tuple(str), optional): File suffix that we are - interested in. Default: None. - recursive (bool, optional): If set to True, recursively scan the - directory. Default: False. - case_sensitive (bool, optional) : If set to False, ignore the case of - suffix. Default: True. - - Returns: - A generator for all the interested files with relative paths. - """ - if isinstance(dir_path, (str, Path)): - dir_path = str(dir_path) - else: - raise TypeError('"dir_path" must be a string or Path object') - - if (suffix is not None) and not isinstance(suffix, (str, tuple)): - raise TypeError('"suffix" must be a string or tuple of strings') - - if suffix is not None and not case_sensitive: - suffix = suffix.lower() if isinstance(suffix, str) else tuple( - item.lower() for item in suffix) - - root = dir_path - - def _scandir(dir_path, suffix, recursive, case_sensitive): - for entry in os.scandir(dir_path): - if not entry.name.startswith('.') and entry.is_file(): - rel_path = osp.relpath(entry.path, root) - _rel_path = rel_path if case_sensitive else rel_path.lower() - if suffix is None or _rel_path.endswith(suffix): - yield rel_path - elif recursive and os.path.isdir(entry.path): - # scan recursively if entry.path is a directory - yield from _scandir(entry.path, suffix, recursive, - case_sensitive) - - return _scandir(dir_path, suffix, recursive, case_sensitive) - - -def find_vcs_root(path, markers=('.git', )): - """Finds the root directory (including itself) of specified markers. - - Args: - path (str): Path of directory or file. 
- markers (list[str], optional): List of file or directory names. - - Returns: - The directory contained one of the markers or None if not found. - """ - if osp.isfile(path): - path = osp.dirname(path) - - prev, cur = None, osp.abspath(osp.expanduser(path)) - while cur != prev: - if any(osp.exists(osp.join(cur, marker)) for marker in markers): - return cur - prev, cur = cur, osp.split(cur)[0] - return None diff --git a/ControlNet/annotator/uniformer/mmcv/utils/progressbar.py b/ControlNet/annotator/uniformer/mmcv/utils/progressbar.py deleted file mode 100644 index 0062f670dd94fa9da559ab26ef85517dcf5211c7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/progressbar.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import sys -from collections.abc import Iterable -from multiprocessing import Pool -from shutil import get_terminal_size - -from .timer import Timer - - -class ProgressBar: - """A progress bar which can print the progress.""" - - def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout): - self.task_num = task_num - self.bar_width = bar_width - self.completed = 0 - self.file = file - if start: - self.start() - - @property - def terminal_width(self): - width, _ = get_terminal_size() - return width - - def start(self): - if self.task_num > 0: - self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' - 'elapsed: 0s, ETA:') - else: - self.file.write('completed: 0, elapsed: 0s') - self.file.flush() - self.timer = Timer() - - def update(self, num_tasks=1): - assert num_tasks > 0 - self.completed += num_tasks - elapsed = self.timer.since_start() - if elapsed > 0: - fps = self.completed / elapsed - else: - fps = float('inf') - if self.task_num > 0: - percentage = self.completed / float(self.task_num) - eta = int(elapsed * (1 - percentage) / percentage + 0.5) - msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ - f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ - f'ETA: {eta:5}s' - - bar_width = min(self.bar_width, - int(self.terminal_width - len(msg)) + 2, - int(self.terminal_width * 0.6)) - bar_width = max(2, bar_width) - mark_width = int(bar_width * percentage) - bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) - self.file.write(msg.format(bar_chars)) - else: - self.file.write( - f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' - f' {fps:.1f} tasks/s') - self.file.flush() - - -def track_progress(func, tasks, bar_width=50, file=sys.stdout, **kwargs): - """Track the progress of tasks execution with a progress bar. - - Tasks are done with a simple for-loop. - - Args: - func (callable): The function to be applied to each task. - tasks (list or tuple[Iterable, int]): A list of tasks or - (tasks, total num). - bar_width (int): Width of progress bar. - - Returns: - list: The task results. 
- """ - if isinstance(tasks, tuple): - assert len(tasks) == 2 - assert isinstance(tasks[0], Iterable) - assert isinstance(tasks[1], int) - task_num = tasks[1] - tasks = tasks[0] - elif isinstance(tasks, Iterable): - task_num = len(tasks) - else: - raise TypeError( - '"tasks" must be an iterable object or a (iterator, int) tuple') - prog_bar = ProgressBar(task_num, bar_width, file=file) - results = [] - for task in tasks: - results.append(func(task, **kwargs)) - prog_bar.update() - prog_bar.file.write('\n') - return results - - -def init_pool(process_num, initializer=None, initargs=None): - if initializer is None: - return Pool(process_num) - elif initargs is None: - return Pool(process_num, initializer) - else: - if not isinstance(initargs, tuple): - raise TypeError('"initargs" must be a tuple') - return Pool(process_num, initializer, initargs) - - -def track_parallel_progress(func, - tasks, - nproc, - initializer=None, - initargs=None, - bar_width=50, - chunksize=1, - skip_first=False, - keep_order=True, - file=sys.stdout): - """Track the progress of parallel task execution with a progress bar. - - The built-in :mod:`multiprocessing` module is used for process pools and - tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. - - Args: - func (callable): The function to be applied to each task. - tasks (list or tuple[Iterable, int]): A list of tasks or - (tasks, total num). - nproc (int): Process (worker) number. - initializer (None or callable): Refer to :class:`multiprocessing.Pool` - for details. - initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for - details. - chunksize (int): Refer to :class:`multiprocessing.Pool` for details. - bar_width (int): Width of progress bar. - skip_first (bool): Whether to skip the first sample for each worker - when estimating fps, since the initialization step may takes - longer. - keep_order (bool): If True, :func:`Pool.imap` is used, otherwise - :func:`Pool.imap_unordered` is used. - - Returns: - list: The task results. - """ - if isinstance(tasks, tuple): - assert len(tasks) == 2 - assert isinstance(tasks[0], Iterable) - assert isinstance(tasks[1], int) - task_num = tasks[1] - tasks = tasks[0] - elif isinstance(tasks, Iterable): - task_num = len(tasks) - else: - raise TypeError( - '"tasks" must be an iterable object or a (iterator, int) tuple') - pool = init_pool(nproc, initializer, initargs) - start = not skip_first - task_num -= nproc * chunksize * int(skip_first) - prog_bar = ProgressBar(task_num, bar_width, start, file=file) - results = [] - if keep_order: - gen = pool.imap(func, tasks, chunksize) - else: - gen = pool.imap_unordered(func, tasks, chunksize) - for result in gen: - results.append(result) - if skip_first: - if len(results) < nproc * chunksize: - continue - elif len(results) == nproc * chunksize: - prog_bar.start() - continue - prog_bar.update() - prog_bar.file.write('\n') - pool.close() - pool.join() - return results - - -def track_iter_progress(tasks, bar_width=50, file=sys.stdout): - """Track the progress of tasks iteration or enumeration with a progress - bar. - - Tasks are yielded with a simple for-loop. - - Args: - tasks (list or tuple[Iterable, int]): A list of tasks or - (tasks, total num). - bar_width (int): Width of progress bar. - - Yields: - list: The task results. 
- """ - if isinstance(tasks, tuple): - assert len(tasks) == 2 - assert isinstance(tasks[0], Iterable) - assert isinstance(tasks[1], int) - task_num = tasks[1] - tasks = tasks[0] - elif isinstance(tasks, Iterable): - task_num = len(tasks) - else: - raise TypeError( - '"tasks" must be an iterable object or a (iterator, int) tuple') - prog_bar = ProgressBar(task_num, bar_width, file=file) - for task in tasks: - yield task - prog_bar.update() - prog_bar.file.write('\n') diff --git a/ControlNet/annotator/uniformer/mmcv/utils/registry.py b/ControlNet/annotator/uniformer/mmcv/utils/registry.py deleted file mode 100644 index fa9df39bc9f3d8d568361e7250ab35468f2b74e0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/registry.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import inspect -import warnings -from functools import partial - -from .misc import is_seq_of - - -def build_from_cfg(cfg, registry, default_args=None): - """Build a module from config dict. - - Args: - cfg (dict): Config dict. It should at least contain the key "type". - registry (:obj:`Registry`): The registry to search the type from. - default_args (dict, optional): Default initialization arguments. - - Returns: - object: The constructed object. - """ - if not isinstance(cfg, dict): - raise TypeError(f'cfg must be a dict, but got {type(cfg)}') - if 'type' not in cfg: - if default_args is None or 'type' not in default_args: - raise KeyError( - '`cfg` or `default_args` must contain the key "type", ' - f'but got {cfg}\n{default_args}') - if not isinstance(registry, Registry): - raise TypeError('registry must be an mmcv.Registry object, ' - f'but got {type(registry)}') - if not (isinstance(default_args, dict) or default_args is None): - raise TypeError('default_args must be a dict or None, ' - f'but got {type(default_args)}') - - args = cfg.copy() - - if default_args is not None: - for name, value in default_args.items(): - args.setdefault(name, value) - - obj_type = args.pop('type') - if isinstance(obj_type, str): - obj_cls = registry.get(obj_type) - if obj_cls is None: - raise KeyError( - f'{obj_type} is not in the {registry.name} registry') - elif inspect.isclass(obj_type): - obj_cls = obj_type - else: - raise TypeError( - f'type must be a str or valid type, but got {type(obj_type)}') - try: - return obj_cls(**args) - except Exception as e: - # Normal TypeError does not print class name. - raise type(e)(f'{obj_cls.__name__}: {e}') - - -class Registry: - """A registry to map strings to classes. - - Registered object could be built from registry. - Example: - >>> MODELS = Registry('models') - >>> @MODELS.register_module() - >>> class ResNet: - >>> pass - >>> resnet = MODELS.build(dict(type='ResNet')) - - Please refer to - https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html for - advanced usage. - - Args: - name (str): Registry name. - build_func(func, optional): Build function to construct instance from - Registry, func:`build_from_cfg` is used if neither ``parent`` or - ``build_func`` is specified. If ``parent`` is specified and - ``build_func`` is not given, ``build_func`` will be inherited - from ``parent``. Default: None. - parent (Registry, optional): Parent registry. The class registered in - children registry could be built from parent. Default: None. - scope (str, optional): The scope of registry. It is the key to search - for children registry. If not specified, scope will be the name of - the package where class is defined, e.g. 
mmdet, mmcls, mmseg. - Default: None. - """ - - def __init__(self, name, build_func=None, parent=None, scope=None): - self._name = name - self._module_dict = dict() - self._children = dict() - self._scope = self.infer_scope() if scope is None else scope - - # self.build_func will be set with the following priority: - # 1. build_func - # 2. parent.build_func - # 3. build_from_cfg - if build_func is None: - if parent is not None: - self.build_func = parent.build_func - else: - self.build_func = build_from_cfg - else: - self.build_func = build_func - if parent is not None: - assert isinstance(parent, Registry) - parent._add_children(self) - self.parent = parent - else: - self.parent = None - - def __len__(self): - return len(self._module_dict) - - def __contains__(self, key): - return self.get(key) is not None - - def __repr__(self): - format_str = self.__class__.__name__ + \ - f'(name={self._name}, ' \ - f'items={self._module_dict})' - return format_str - - @staticmethod - def infer_scope(): - """Infer the scope of registry. - - The name of the package where registry is defined will be returned. - - Example: - # in mmdet/models/backbone/resnet.py - >>> MODELS = Registry('models') - >>> @MODELS.register_module() - >>> class ResNet: - >>> pass - The scope of ``ResNet`` will be ``mmdet``. - - - Returns: - scope (str): The inferred scope name. - """ - # inspect.stack() trace where this function is called, the index-2 - # indicates the frame where `infer_scope()` is called - filename = inspect.getmodule(inspect.stack()[2][0]).__name__ - split_filename = filename.split('.') - return split_filename[0] - - @staticmethod - def split_scope_key(key): - """Split scope and key. - - The first scope will be split from key. - - Examples: - >>> Registry.split_scope_key('mmdet.ResNet') - 'mmdet', 'ResNet' - >>> Registry.split_scope_key('ResNet') - None, 'ResNet' - - Return: - scope (str, None): The first scope. - key (str): The remaining key. - """ - split_index = key.find('.') - if split_index != -1: - return key[:split_index], key[split_index + 1:] - else: - return None, key - - @property - def name(self): - return self._name - - @property - def scope(self): - return self._scope - - @property - def module_dict(self): - return self._module_dict - - @property - def children(self): - return self._children - - def get(self, key): - """Get the registry record. - - Args: - key (str): The class name in string format. - - Returns: - class: The corresponding class. - """ - scope, real_key = self.split_scope_key(key) - if scope is None or scope == self._scope: - # get from self - if real_key in self._module_dict: - return self._module_dict[real_key] - else: - # get from self._children - if scope in self._children: - return self._children[scope].get(real_key) - else: - # goto root - parent = self.parent - while parent.parent is not None: - parent = parent.parent - return parent.get(key) - - def build(self, *args, **kwargs): - return self.build_func(*args, **kwargs, registry=self) - - def _add_children(self, registry): - """Add children for a registry. - - The ``registry`` will be added as children based on its scope. - The parent registry could build objects from children registry. 
- - Example: - >>> models = Registry('models') - >>> mmdet_models = Registry('models', parent=models) - >>> @mmdet_models.register_module() - >>> class ResNet: - >>> pass - >>> resnet = models.build(dict(type='mmdet.ResNet')) - """ - - assert isinstance(registry, Registry) - assert registry.scope is not None - assert registry.scope not in self.children, \ - f'scope {registry.scope} exists in {self.name} registry' - self.children[registry.scope] = registry - - def _register_module(self, module_class, module_name=None, force=False): - if not inspect.isclass(module_class): - raise TypeError('module must be a class, ' - f'but got {type(module_class)}') - - if module_name is None: - module_name = module_class.__name__ - if isinstance(module_name, str): - module_name = [module_name] - for name in module_name: - if not force and name in self._module_dict: - raise KeyError(f'{name} is already registered ' - f'in {self.name}') - self._module_dict[name] = module_class - - def deprecated_register_module(self, cls=None, force=False): - warnings.warn( - 'The old API of register_module(module, force=False) ' - 'is deprecated and will be removed, please use the new API ' - 'register_module(name=None, force=False, module=None) instead.') - if cls is None: - return partial(self.deprecated_register_module, force=force) - self._register_module(cls, force=force) - return cls - - def register_module(self, name=None, force=False, module=None): - """Register a module. - - A record will be added to `self._module_dict`, whose key is the class - name or the specified name, and value is the class itself. - It can be used as a decorator or a normal function. - - Example: - >>> backbones = Registry('backbone') - >>> @backbones.register_module() - >>> class ResNet: - >>> pass - - >>> backbones = Registry('backbone') - >>> @backbones.register_module(name='mnet') - >>> class MobileNet: - >>> pass - - >>> backbones = Registry('backbone') - >>> class ResNet: - >>> pass - >>> backbones.register_module(ResNet) - - Args: - name (str | None): The module name to be registered. If not - specified, the class name will be used. - force (bool, optional): Whether to override an existing class with - the same name. Default: False. - module (type): Module class to be registered. - """ - if not isinstance(force, bool): - raise TypeError(f'force must be a boolean, but got {type(force)}') - # NOTE: This is a walkaround to be compatible with the old api, - # while it may introduce unexpected bugs. - if isinstance(name, type): - return self.deprecated_register_module(name, force=force) - - # raise the error ahead of time - if not (name is None or isinstance(name, str) or is_seq_of(name, str)): - raise TypeError( - 'name must be either of None, an instance of str or a sequence' - f' of str, but got {type(name)}') - - # use it as a normal method: x.register_module(module=SomeClass) - if module is not None: - self._register_module( - module_class=module, module_name=name, force=force) - return module - - # use it as a decorator: @x.register_module() - def _register(cls): - self._register_module( - module_class=cls, module_name=name, force=force) - return cls - - return _register diff --git a/ControlNet/annotator/uniformer/mmcv/utils/testing.py b/ControlNet/annotator/uniformer/mmcv/utils/testing.py deleted file mode 100644 index a27f936da8ec14bac18562ede0a79d476d82f797..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/testing.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) Open-MMLab. 
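The docstrings above already sketch the decorator form of `register_module`; for completeness, a small end-to-end sketch of registering and building through the `Registry` class above (the `BACKBONES`/`ResNet` names are illustrative, and the import uses the module path shown in this diff):

# Illustrative only: register a class and build it from a config dict.
from annotator.uniformer.mmcv.utils.registry import Registry

BACKBONES = Registry('backbone')


@BACKBONES.register_module()
class ResNet:

    def __init__(self, depth=50):
        self.depth = depth


# `build` dispatches to `build_from_cfg`: the 'type' key selects the
# registered class and the remaining keys become constructor kwargs.
model = BACKBONES.build(dict(type='ResNet', depth=101))
assert isinstance(model, ResNet) and model.depth == 101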
-import sys -from collections.abc import Iterable -from runpy import run_path -from shlex import split -from typing import Any, Dict, List -from unittest.mock import patch - - -def check_python_script(cmd): - """Run the python cmd script with `__main__`. The difference between - `os.system` is that, this function exectues code in the current process, so - that it can be tracked by coverage tools. Currently it supports two forms: - - - ./tests/data/scripts/hello.py zz - - python tests/data/scripts/hello.py zz - """ - args = split(cmd) - if args[0] == 'python': - args = args[1:] - with patch.object(sys, 'argv', args): - run_path(args[0], run_name='__main__') - - -def _any(judge_result): - """Since built-in ``any`` works only when the element of iterable is not - iterable, implement the function.""" - if not isinstance(judge_result, Iterable): - return judge_result - - try: - for element in judge_result: - if _any(element): - return True - except TypeError: - # Maybe encounter the case: torch.tensor(True) | torch.tensor(False) - if judge_result: - return True - return False - - -def assert_dict_contains_subset(dict_obj: Dict[Any, Any], - expected_subset: Dict[Any, Any]) -> bool: - """Check if the dict_obj contains the expected_subset. - - Args: - dict_obj (Dict[Any, Any]): Dict object to be checked. - expected_subset (Dict[Any, Any]): Subset expected to be contained in - dict_obj. - - Returns: - bool: Whether the dict_obj contains the expected_subset. - """ - - for key, value in expected_subset.items(): - if key not in dict_obj.keys() or _any(dict_obj[key] != value): - return False - return True - - -def assert_attrs_equal(obj: Any, expected_attrs: Dict[str, Any]) -> bool: - """Check if attribute of class object is correct. - - Args: - obj (object): Class object to be checked. - expected_attrs (Dict[str, Any]): Dict of the expected attrs. - - Returns: - bool: Whether the attribute of class object is correct. - """ - for attr, value in expected_attrs.items(): - if not hasattr(obj, attr) or _any(getattr(obj, attr) != value): - return False - return True - - -def assert_dict_has_keys(obj: Dict[str, Any], - expected_keys: List[str]) -> bool: - """Check if the obj has all the expected_keys. - - Args: - obj (Dict[str, Any]): Object to be checked. - expected_keys (List[str]): Keys expected to contained in the keys of - the obj. - - Returns: - bool: Whether the obj has the expected keys. - """ - return set(expected_keys).issubset(set(obj.keys())) - - -def assert_keys_equal(result_keys: List[str], target_keys: List[str]) -> bool: - """Check if target_keys is equal to result_keys. - - Args: - result_keys (List[str]): Result keys to be checked. - target_keys (List[str]): Target keys to be checked. - - Returns: - bool: Whether target_keys is equal to result_keys. - """ - return set(result_keys) == set(target_keys) - - -def assert_is_norm_layer(module) -> bool: - """Check if the module is a norm layer. - - Args: - module (nn.Module): The module to be checked. - - Returns: - bool: Whether the module is a norm layer. - """ - from .parrots_wrapper import _BatchNorm, _InstanceNorm - from torch.nn import GroupNorm, LayerNorm - norm_layer_candidates = (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm) - return isinstance(module, norm_layer_candidates) - - -def assert_params_all_zeros(module) -> bool: - """Check if the parameters of the module is all zeros. - - Args: - module (nn.Module): The module to be checked. - - Returns: - bool: Whether the parameters of the module is all zeros. 
- """ - weight_data = module.weight.data - is_weight_zero = weight_data.allclose( - weight_data.new_zeros(weight_data.size())) - - if hasattr(module, 'bias') and module.bias is not None: - bias_data = module.bias.data - is_bias_zero = bias_data.allclose( - bias_data.new_zeros(bias_data.size())) - else: - is_bias_zero = True - - return is_weight_zero and is_bias_zero diff --git a/ControlNet/annotator/uniformer/mmcv/utils/timer.py b/ControlNet/annotator/uniformer/mmcv/utils/timer.py deleted file mode 100644 index e3db7d497d8b374e18b5297e0a1d6eb186fd8cba..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/timer.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from time import time - - -class TimerError(Exception): - - def __init__(self, message): - self.message = message - super(TimerError, self).__init__(message) - - -class Timer: - """A flexible Timer class. - - :Example: - - >>> import time - >>> import annotator.uniformer.mmcv as mmcv - >>> with mmcv.Timer(): - >>> # simulate a code block that will run for 1s - >>> time.sleep(1) - 1.000 - >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): - >>> # simulate a code block that will run for 1s - >>> time.sleep(1) - it takes 1.0 seconds - >>> timer = mmcv.Timer() - >>> time.sleep(0.5) - >>> print(timer.since_start()) - 0.500 - >>> time.sleep(0.5) - >>> print(timer.since_last_check()) - 0.500 - >>> print(timer.since_start()) - 1.000 - """ - - def __init__(self, start=True, print_tmpl=None): - self._is_running = False - self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' - if start: - self.start() - - @property - def is_running(self): - """bool: indicate whether the timer is running""" - return self._is_running - - def __enter__(self): - self.start() - return self - - def __exit__(self, type, value, traceback): - print(self.print_tmpl.format(self.since_last_check())) - self._is_running = False - - def start(self): - """Start the timer.""" - if not self._is_running: - self._t_start = time() - self._is_running = True - self._t_last = time() - - def since_start(self): - """Total time since the timer is started. - - Returns (float): Time in seconds. - """ - if not self._is_running: - raise TimerError('timer is not running') - self._t_last = time() - return self._t_last - self._t_start - - def since_last_check(self): - """Time since the last checking. - - Either :func:`since_start` or :func:`since_last_check` is a checking - operation. - - Returns (float): Time in seconds. - """ - if not self._is_running: - raise TimerError('timer is not running') - dur = time() - self._t_last - self._t_last = time() - return dur - - -_g_timers = {} # global timers - - -def check_time(timer_id): - """Add check points in a single line. - - This method is suitable for running a task on a list of items. A timer will - be registered when the method is called for the first time. - - :Example: - - >>> import time - >>> import annotator.uniformer.mmcv as mmcv - >>> for i in range(1, 6): - >>> # simulate a code block - >>> time.sleep(i) - >>> mmcv.check_time('task1') - 2.000 - 3.000 - 4.000 - 5.000 - - Args: - timer_id (str): Timer identifier. 
- """ - if timer_id not in _g_timers: - _g_timers[timer_id] = Timer() - return 0 - else: - return _g_timers[timer_id].since_last_check() diff --git a/ControlNet/annotator/uniformer/mmcv/utils/trace.py b/ControlNet/annotator/uniformer/mmcv/utils/trace.py deleted file mode 100644 index 5ca99dc3eda05ef980d9a4249b50deca8273b6cc..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/trace.py +++ /dev/null @@ -1,23 +0,0 @@ -import warnings - -import torch - -from annotator.uniformer.mmcv.utils import digit_version - - -def is_jit_tracing() -> bool: - if (torch.__version__ != 'parrots' - and digit_version(torch.__version__) >= digit_version('1.6.0')): - on_trace = torch.jit.is_tracing() - # In PyTorch 1.6, torch.jit.is_tracing has a bug. - # Refers to https://github.com/pytorch/pytorch/issues/42448 - if isinstance(on_trace, bool): - return on_trace - else: - return torch._C._is_tracing() - else: - warnings.warn( - 'torch.jit.is_tracing is only supported after v1.6.0. ' - 'Therefore is_tracing returns False automatically. Please ' - 'set on_trace manually if you are using trace.', UserWarning) - return False diff --git a/ControlNet/annotator/uniformer/mmcv/utils/version_utils.py b/ControlNet/annotator/uniformer/mmcv/utils/version_utils.py deleted file mode 100644 index 963c45a2e8a86a88413ab6c18c22481fb9831985..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/utils/version_utils.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import subprocess -import warnings - -from packaging.version import parse - - -def digit_version(version_str: str, length: int = 4): - """Convert a version string into a tuple of integers. - - This method is usually used for comparing two versions. For pre-release - versions: alpha < beta < rc. - - Args: - version_str (str): The version string. - length (int): The maximum number of version levels. Default: 4. - - Returns: - tuple[int]: The version info in digits (integers). - """ - assert 'parrots' not in version_str - version = parse(version_str) - assert version.release, f'failed to parse version {version_str}' - release = list(version.release) - release = release[:length] - if len(release) < length: - release = release + [0] * (length - len(release)) - if version.is_prerelease: - mapping = {'a': -3, 'b': -2, 'rc': -1} - val = -4 - # version.pre can be None - if version.pre: - if version.pre[0] not in mapping: - warnings.warn(f'unknown prerelease version {version.pre[0]}, ' - 'version checking may go wrong') - else: - val = mapping[version.pre[0]] - release.extend([val, version.pre[-1]]) - else: - release.extend([val, 0]) - - elif version.is_postrelease: - release.extend([1, version.post]) - else: - release.extend([0, 0]) - return tuple(release) - - -def _minimal_ext_cmd(cmd): - # construct minimal environment - env = {} - for k in ['SYSTEMROOT', 'PATH', 'HOME']: - v = os.environ.get(k) - if v is not None: - env[k] = v - # LANGUAGE is used on win32 - env['LANGUAGE'] = 'C' - env['LANG'] = 'C' - env['LC_ALL'] = 'C' - out = subprocess.Popen( - cmd, stdout=subprocess.PIPE, env=env).communicate()[0] - return out - - -def get_git_hash(fallback='unknown', digits=None): - """Get the git hash of the current repo. - - Args: - fallback (str, optional): The fallback string when git hash is - unavailable. Defaults to 'unknown'. - digits (int, optional): kept digits of the hash. Defaults to None, - meaning all digits are kept. - - Returns: - str: Git commit hash. 
- """ - - if digits is not None and not isinstance(digits, int): - raise TypeError('digits must be None or an integer') - - try: - out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) - sha = out.strip().decode('ascii') - if digits is not None: - sha = sha[:digits] - except OSError: - sha = fallback - - return sha diff --git a/ControlNet/annotator/uniformer/mmcv/version.py b/ControlNet/annotator/uniformer/mmcv/version.py deleted file mode 100644 index 1cce4e50bd692d4002e3cac3c545a3fb2efe95d0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/version.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -__version__ = '1.3.17' - - -def parse_version_info(version_str: str, length: int = 4) -> tuple: - """Parse a version string into a tuple. - - Args: - version_str (str): The version string. - length (int): The maximum number of version levels. Default: 4. - - Returns: - tuple[int | str]: The version info, e.g., "1.3.0" is parsed into - (1, 3, 0, 0, 0, 0), and "2.0.0rc1" is parsed into - (2, 0, 0, 0, 'rc', 1) (when length is set to 4). - """ - from packaging.version import parse - version = parse(version_str) - assert version.release, f'failed to parse version {version_str}' - release = list(version.release) - release = release[:length] - if len(release) < length: - release = release + [0] * (length - len(release)) - if version.is_prerelease: - release.extend(list(version.pre)) - elif version.is_postrelease: - release.extend(list(version.post)) - else: - release.extend([0, 0]) - return tuple(release) - - -version_info = tuple(int(x) for x in __version__.split('.')[:3]) - -__all__ = ['__version__', 'version_info', 'parse_version_info'] diff --git a/ControlNet/annotator/uniformer/mmcv/video/__init__.py b/ControlNet/annotator/uniformer/mmcv/video/__init__.py deleted file mode 100644 index 73199b01dec52820dc6ca0139903536344d5a1eb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/video/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .io import Cache, VideoReader, frames2video -from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, - flowwrite, quantize_flow, sparse_flow_from_bytes) -from .processing import concat_video, convert_video, cut_video, resize_video - -__all__ = [ - 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', - 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', - 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' -] diff --git a/ControlNet/annotator/uniformer/mmcv/video/io.py b/ControlNet/annotator/uniformer/mmcv/video/io.py deleted file mode 100644 index 9879154227f640c262853b92c219461c6f67ee8e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/video/io.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os.path as osp -from collections import OrderedDict - -import cv2 -from cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT, - CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH, - CAP_PROP_POS_FRAMES, VideoWriter_fourcc) - -from annotator.uniformer.mmcv.utils import (check_file_exist, mkdir_or_exist, scandir, - track_progress) - - -class Cache: - - def __init__(self, capacity): - self._cache = OrderedDict() - self._capacity = int(capacity) - if capacity <= 0: - raise ValueError('capacity must be a positive integer') - - @property - def capacity(self): - return self._capacity - - @property - def size(self): - return len(self._cache) - - def put(self, key, val): - if key in self._cache: - return - if len(self._cache) >= self.capacity: - self._cache.popitem(last=False) - self._cache[key] = val - - def get(self, key, default=None): - val = self._cache[key] if key in self._cache else default - return val - - -class VideoReader: - """Video class with similar usage to a list object. - - This video warpper class provides convenient apis to access frames. - There exists an issue of OpenCV's VideoCapture class that jumping to a - certain frame may be inaccurate. It is fixed in this class by checking - the position after jumping each time. - Cache is used when decoding videos. So if the same frame is visited for - the second time, there is no need to decode again if it is stored in the - cache. - - :Example: - - >>> import annotator.uniformer.mmcv as mmcv - >>> v = mmcv.VideoReader('sample.mp4') - >>> len(v) # get the total frame number with `len()` - 120 - >>> for img in v: # v is iterable - >>> mmcv.imshow(img) - >>> v[5] # get the 6th frame - """ - - def __init__(self, filename, cache_capacity=10): - # Check whether the video path is a url - if not filename.startswith(('https://', 'http://')): - check_file_exist(filename, 'Video file not found: ' + filename) - self._vcap = cv2.VideoCapture(filename) - assert cache_capacity > 0 - self._cache = Cache(cache_capacity) - self._position = 0 - # get basic info - self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH)) - self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT)) - self._fps = self._vcap.get(CAP_PROP_FPS) - self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT)) - self._fourcc = self._vcap.get(CAP_PROP_FOURCC) - - @property - def vcap(self): - """:obj:`cv2.VideoCapture`: The raw VideoCapture object.""" - return self._vcap - - @property - def opened(self): - """bool: Indicate whether the video is opened.""" - return self._vcap.isOpened() - - @property - def width(self): - """int: Width of video frames.""" - return self._width - - @property - def height(self): - """int: Height of video frames.""" - return self._height - - @property - def resolution(self): - """tuple: Video resolution (width, height).""" - return (self._width, self._height) - - @property - def fps(self): - """float: FPS of the video.""" - return self._fps - - @property - def frame_cnt(self): - """int: Total frames of the video.""" - return self._frame_cnt - - @property - def fourcc(self): - """str: "Four character code" of the video.""" - return self._fourcc - - @property - def position(self): - """int: Current cursor position, indicating frame decoded.""" - return self._position - - def _get_real_position(self): - return int(round(self._vcap.get(CAP_PROP_POS_FRAMES))) - - def _set_real_position(self, frame_id): - self._vcap.set(CAP_PROP_POS_FRAMES, frame_id) - pos = self._get_real_position() - for _ in range(frame_id - pos): - self._vcap.read() - 
self._position = frame_id - - def read(self): - """Read the next frame. - - If the next frame have been decoded before and in the cache, then - return it directly, otherwise decode, cache and return it. - - Returns: - ndarray or None: Return the frame if successful, otherwise None. - """ - # pos = self._position - if self._cache: - img = self._cache.get(self._position) - if img is not None: - ret = True - else: - if self._position != self._get_real_position(): - self._set_real_position(self._position) - ret, img = self._vcap.read() - if ret: - self._cache.put(self._position, img) - else: - ret, img = self._vcap.read() - if ret: - self._position += 1 - return img - - def get_frame(self, frame_id): - """Get frame by index. - - Args: - frame_id (int): Index of the expected frame, 0-based. - - Returns: - ndarray or None: Return the frame if successful, otherwise None. - """ - if frame_id < 0 or frame_id >= self._frame_cnt: - raise IndexError( - f'"frame_id" must be between 0 and {self._frame_cnt - 1}') - if frame_id == self._position: - return self.read() - if self._cache: - img = self._cache.get(frame_id) - if img is not None: - self._position = frame_id + 1 - return img - self._set_real_position(frame_id) - ret, img = self._vcap.read() - if ret: - if self._cache: - self._cache.put(self._position, img) - self._position += 1 - return img - - def current_frame(self): - """Get the current frame (frame that is just visited). - - Returns: - ndarray or None: If the video is fresh, return None, otherwise - return the frame. - """ - if self._position == 0: - return None - return self._cache.get(self._position - 1) - - def cvt2frames(self, - frame_dir, - file_start=0, - filename_tmpl='{:06d}.jpg', - start=0, - max_num=0, - show_progress=True): - """Convert a video to frame images. - - Args: - frame_dir (str): Output directory to store all the frame images. - file_start (int): Filenames will start from the specified number. - filename_tmpl (str): Filename template with the index as the - placeholder. - start (int): The starting frame index. - max_num (int): Maximum number of frames to be written. - show_progress (bool): Whether to show a progress bar. 
- """ - mkdir_or_exist(frame_dir) - if max_num == 0: - task_num = self.frame_cnt - start - else: - task_num = min(self.frame_cnt - start, max_num) - if task_num <= 0: - raise ValueError('start must be less than total frame number') - if start > 0: - self._set_real_position(start) - - def write_frame(file_idx): - img = self.read() - if img is None: - return - filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) - cv2.imwrite(filename, img) - - if show_progress: - track_progress(write_frame, range(file_start, - file_start + task_num)) - else: - for i in range(task_num): - write_frame(file_start + i) - - def __len__(self): - return self.frame_cnt - - def __getitem__(self, index): - if isinstance(index, slice): - return [ - self.get_frame(i) - for i in range(*index.indices(self.frame_cnt)) - ] - # support negative indexing - if index < 0: - index += self.frame_cnt - if index < 0: - raise IndexError('index out of range') - return self.get_frame(index) - - def __iter__(self): - self._set_real_position(0) - return self - - def __next__(self): - img = self.read() - if img is not None: - return img - else: - raise StopIteration - - next = __next__ - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self._vcap.release() - - -def frames2video(frame_dir, - video_file, - fps=30, - fourcc='XVID', - filename_tmpl='{:06d}.jpg', - start=0, - end=0, - show_progress=True): - """Read the frame images from a directory and join them as a video. - - Args: - frame_dir (str): The directory containing video frames. - video_file (str): Output filename. - fps (float): FPS of the output video. - fourcc (str): Fourcc of the output video, this should be compatible - with the output file type. - filename_tmpl (str): Filename template with the index as the variable. - start (int): Starting frame index. - end (int): Ending frame index. - show_progress (bool): Whether to show a progress bar. - """ - if end == 0: - ext = filename_tmpl.split('.')[-1] - end = len([name for name in scandir(frame_dir, ext)]) - first_file = osp.join(frame_dir, filename_tmpl.format(start)) - check_file_exist(first_file, 'The start frame not found: ' + first_file) - img = cv2.imread(first_file) - height, width = img.shape[:2] - resolution = (width, height) - vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps, - resolution) - - def write_frame(file_idx): - filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) - img = cv2.imread(filename) - vwriter.write(img) - - if show_progress: - track_progress(write_frame, range(start, end)) - else: - for i in range(start, end): - write_frame(i) - vwriter.release() diff --git a/ControlNet/annotator/uniformer/mmcv/video/optflow.py b/ControlNet/annotator/uniformer/mmcv/video/optflow.py deleted file mode 100644 index 84160f8d6ef9fceb5a2f89e7481593109fc1905d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/video/optflow.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import cv2 -import numpy as np - -from annotator.uniformer.mmcv.arraymisc import dequantize, quantize -from annotator.uniformer.mmcv.image import imread, imwrite -from annotator.uniformer.mmcv.utils import is_str - - -def flowread(flow_or_path, quantize=False, concat_axis=0, *args, **kwargs): - """Read an optical flow map. - - Args: - flow_or_path (ndarray or str): A flow map or filepath. 
- quantize (bool): whether to read quantized pair, if set to True, - remaining args will be passed to :func:`dequantize_flow`. - concat_axis (int): The axis that dx and dy are concatenated, - can be either 0 or 1. Ignored if quantize is False. - - Returns: - ndarray: Optical flow represented as a (h, w, 2) numpy array - """ - if isinstance(flow_or_path, np.ndarray): - if (flow_or_path.ndim != 3) or (flow_or_path.shape[-1] != 2): - raise ValueError(f'Invalid flow with shape {flow_or_path.shape}') - return flow_or_path - elif not is_str(flow_or_path): - raise TypeError(f'"flow_or_path" must be a filename or numpy array, ' - f'not {type(flow_or_path)}') - - if not quantize: - with open(flow_or_path, 'rb') as f: - try: - header = f.read(4).decode('utf-8') - except Exception: - raise IOError(f'Invalid flow file: {flow_or_path}') - else: - if header != 'PIEH': - raise IOError(f'Invalid flow file: {flow_or_path}, ' - 'header does not contain PIEH') - - w = np.fromfile(f, np.int32, 1).squeeze() - h = np.fromfile(f, np.int32, 1).squeeze() - flow = np.fromfile(f, np.float32, w * h * 2).reshape((h, w, 2)) - else: - assert concat_axis in [0, 1] - cat_flow = imread(flow_or_path, flag='unchanged') - if cat_flow.ndim != 2: - raise IOError( - f'{flow_or_path} is not a valid quantized flow file, ' - f'its dimension is {cat_flow.ndim}.') - assert cat_flow.shape[concat_axis] % 2 == 0 - dx, dy = np.split(cat_flow, 2, axis=concat_axis) - flow = dequantize_flow(dx, dy, *args, **kwargs) - - return flow.astype(np.float32) - - -def flowwrite(flow, filename, quantize=False, concat_axis=0, *args, **kwargs): - """Write optical flow to file. - - If the flow is not quantized, it will be saved as a .flo file losslessly, - otherwise a jpeg image which is lossy but of much smaller size. (dx and dy - will be concatenated horizontally into a single image if quantize is True.) - - Args: - flow (ndarray): (h, w, 2) array of optical flow. - filename (str): Output filepath. - quantize (bool): Whether to quantize the flow and save it to 2 jpeg - images. If set to True, remaining args will be passed to - :func:`quantize_flow`. - concat_axis (int): The axis that dx and dy are concatenated, - can be either 0 or 1. Ignored if quantize is False. - """ - if not quantize: - with open(filename, 'wb') as f: - f.write('PIEH'.encode('utf-8')) - np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f) - flow = flow.astype(np.float32) - flow.tofile(f) - f.flush() - else: - assert concat_axis in [0, 1] - dx, dy = quantize_flow(flow, *args, **kwargs) - dxdy = np.concatenate((dx, dy), axis=concat_axis) - imwrite(dxdy, filename) - - -def quantize_flow(flow, max_val=0.02, norm=True): - """Quantize flow to [0, 255]. - - After this step, the size of flow will be much smaller, and can be - dumped as jpeg images. - - Args: - flow (ndarray): (h, w, 2) array of optical flow. - max_val (float): Maximum value of flow, values beyond - [-max_val, max_val] will be truncated. - norm (bool): Whether to divide flow values by image width/height. - - Returns: - tuple[ndarray]: Quantized dx and dy. - """ - h, w, _ = flow.shape - dx = flow[..., 0] - dy = flow[..., 1] - if norm: - dx = dx / w # avoid inplace operations - dy = dy / h - # use 255 levels instead of 256 to make sure 0 is 0 after dequantization. - flow_comps = [ - quantize(d, -max_val, max_val, 255, np.uint8) for d in [dx, dy] - ] - return tuple(flow_comps) - - -def dequantize_flow(dx, dy, max_val=0.02, denorm=True): - """Recover from quantized flow. - - Args: - dx (ndarray): Quantized dx. 
- dy (ndarray): Quantized dy. - max_val (float): Maximum value used when quantizing. - denorm (bool): Whether to multiply flow values with width/height. - - Returns: - ndarray: Dequantized flow. - """ - assert dx.shape == dy.shape - assert dx.ndim == 2 or (dx.ndim == 3 and dx.shape[-1] == 1) - - dx, dy = [dequantize(d, -max_val, max_val, 255) for d in [dx, dy]] - - if denorm: - dx *= dx.shape[1] - dy *= dx.shape[0] - flow = np.dstack((dx, dy)) - return flow - - -def flow_warp(img, flow, filling_value=0, interpolate_mode='nearest'): - """Use flow to warp img. - - Args: - img (ndarray, float or uint8): Image to be warped. - flow (ndarray, float): Optical Flow. - filling_value (int): The missing pixels will be set with filling_value. - interpolate_mode (str): bilinear -> Bilinear Interpolation; - nearest -> Nearest Neighbor. - - Returns: - ndarray: Warped image with the same shape of img - """ - warnings.warn('This function is just for prototyping and cannot ' - 'guarantee the computational efficiency.') - assert flow.ndim == 3, 'Flow must be in 3D arrays.' - height = flow.shape[0] - width = flow.shape[1] - channels = img.shape[2] - - output = np.ones( - (height, width, channels), dtype=img.dtype) * filling_value - - grid = np.indices((height, width)).swapaxes(0, 1).swapaxes(1, 2) - dx = grid[:, :, 0] + flow[:, :, 1] - dy = grid[:, :, 1] + flow[:, :, 0] - sx = np.floor(dx).astype(int) - sy = np.floor(dy).astype(int) - valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1) - - if interpolate_mode == 'nearest': - output[valid, :] = img[dx[valid].round().astype(int), - dy[valid].round().astype(int), :] - elif interpolate_mode == 'bilinear': - # dirty walkround for integer positions - eps_ = 1e-6 - dx, dy = dx + eps_, dy + eps_ - left_top_ = img[np.floor(dx[valid]).astype(int), - np.floor(dy[valid]).astype(int), :] * ( - np.ceil(dx[valid]) - dx[valid])[:, None] * ( - np.ceil(dy[valid]) - dy[valid])[:, None] - left_down_ = img[np.ceil(dx[valid]).astype(int), - np.floor(dy[valid]).astype(int), :] * ( - dx[valid] - np.floor(dx[valid]))[:, None] * ( - np.ceil(dy[valid]) - dy[valid])[:, None] - right_top_ = img[np.floor(dx[valid]).astype(int), - np.ceil(dy[valid]).astype(int), :] * ( - np.ceil(dx[valid]) - dx[valid])[:, None] * ( - dy[valid] - np.floor(dy[valid]))[:, None] - right_down_ = img[np.ceil(dx[valid]).astype(int), - np.ceil(dy[valid]).astype(int), :] * ( - dx[valid] - np.floor(dx[valid]))[:, None] * ( - dy[valid] - np.floor(dy[valid]))[:, None] - output[valid, :] = left_top_ + left_down_ + right_top_ + right_down_ - else: - raise NotImplementedError( - 'We only support interpolation modes of nearest and bilinear, ' - f'but got {interpolate_mode}.') - return output.astype(img.dtype) - - -def flow_from_bytes(content): - """Read dense optical flow from bytes. - - .. note:: - This load optical flow function works for FlyingChairs, FlyingThings3D, - Sintel, FlyingChairsOcc datasets, but cannot load the data from - ChairsSDHom. - - Args: - content (bytes): Optical flow bytes got from files or other streams. - - Returns: - ndarray: Loaded optical flow with the shape (H, W, 2). 
- """ - - # header in first 4 bytes - header = content[:4] - if header.decode('utf-8') != 'PIEH': - raise Exception('Flow file header does not contain PIEH') - # width in second 4 bytes - width = np.frombuffer(content[4:], np.int32, 1).squeeze() - # height in third 4 bytes - height = np.frombuffer(content[8:], np.int32, 1).squeeze() - # after first 12 bytes, all bytes are flow - flow = np.frombuffer(content[12:], np.float32, width * height * 2).reshape( - (height, width, 2)) - - return flow - - -def sparse_flow_from_bytes(content): - """Read the optical flow in KITTI datasets from bytes. - - This function is modified from RAFT load the `KITTI datasets - `_. - - Args: - content (bytes): Optical flow bytes got from files or other streams. - - Returns: - Tuple(ndarray, ndarray): Loaded optical flow with the shape (H, W, 2) - and flow valid mask with the shape (H, W). - """ # nopa - - content = np.frombuffer(content, np.uint8) - flow = cv2.imdecode(content, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) - flow = flow[:, :, ::-1].astype(np.float32) - # flow shape (H, W, 2) valid shape (H, W) - flow, valid = flow[:, :, :2], flow[:, :, 2] - flow = (flow - 2**15) / 64.0 - return flow, valid diff --git a/ControlNet/annotator/uniformer/mmcv/video/processing.py b/ControlNet/annotator/uniformer/mmcv/video/processing.py deleted file mode 100644 index 3d90b96e0823d5f116755e7f498d25d17017224a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/video/processing.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import os.path as osp -import subprocess -import tempfile - -from annotator.uniformer.mmcv.utils import requires_executable - - -@requires_executable('ffmpeg') -def convert_video(in_file, - out_file, - print_cmd=False, - pre_options='', - **kwargs): - """Convert a video with ffmpeg. - - This provides a general api to ffmpeg, the executed command is:: - - `ffmpeg -y -i ` - - Options(kwargs) are mapped to ffmpeg commands with the following rules: - - - key=val: "-key val" - - key=True: "-key" - - key=False: "" - - Args: - in_file (str): Input video filename. - out_file (str): Output video filename. - pre_options (str): Options appears before "-i ". - print_cmd (bool): Whether to print the final ffmpeg command. - """ - options = [] - for k, v in kwargs.items(): - if isinstance(v, bool): - if v: - options.append(f'-{k}') - elif k == 'log_level': - assert v in [ - 'quiet', 'panic', 'fatal', 'error', 'warning', 'info', - 'verbose', 'debug', 'trace' - ] - options.append(f'-loglevel {v}') - else: - options.append(f'-{k} {v}') - cmd = f'ffmpeg -y {pre_options} -i {in_file} {" ".join(options)} ' \ - f'{out_file}' - if print_cmd: - print(cmd) - subprocess.call(cmd, shell=True) - - -@requires_executable('ffmpeg') -def resize_video(in_file, - out_file, - size=None, - ratio=None, - keep_ar=False, - log_level='info', - print_cmd=False): - """Resize a video. - - Args: - in_file (str): Input video filename. - out_file (str): Output video filename. - size (tuple): Expected size (w, h), eg, (320, 240) or (320, -1). - ratio (tuple or float): Expected resize ratio, (2, 0.5) means - (w*2, h*0.5). - keep_ar (bool): Whether to keep original aspect ratio. - log_level (str): Logging level of ffmpeg. - print_cmd (bool): Whether to print the final ffmpeg command. 
- """ - if size is None and ratio is None: - raise ValueError('expected size or ratio must be specified') - if size is not None and ratio is not None: - raise ValueError('size and ratio cannot be specified at the same time') - options = {'log_level': log_level} - if size: - if not keep_ar: - options['vf'] = f'scale={size[0]}:{size[1]}' - else: - options['vf'] = f'scale=w={size[0]}:h={size[1]}:' \ - 'force_original_aspect_ratio=decrease' - else: - if not isinstance(ratio, tuple): - ratio = (ratio, ratio) - options['vf'] = f'scale="trunc(iw*{ratio[0]}):trunc(ih*{ratio[1]})"' - convert_video(in_file, out_file, print_cmd, **options) - - -@requires_executable('ffmpeg') -def cut_video(in_file, - out_file, - start=None, - end=None, - vcodec=None, - acodec=None, - log_level='info', - print_cmd=False): - """Cut a clip from a video. - - Args: - in_file (str): Input video filename. - out_file (str): Output video filename. - start (None or float): Start time (in seconds). - end (None or float): End time (in seconds). - vcodec (None or str): Output video codec, None for unchanged. - acodec (None or str): Output audio codec, None for unchanged. - log_level (str): Logging level of ffmpeg. - print_cmd (bool): Whether to print the final ffmpeg command. - """ - options = {'log_level': log_level} - if vcodec is None: - options['vcodec'] = 'copy' - if acodec is None: - options['acodec'] = 'copy' - if start: - options['ss'] = start - else: - start = 0 - if end: - options['t'] = end - start - convert_video(in_file, out_file, print_cmd, **options) - - -@requires_executable('ffmpeg') -def concat_video(video_list, - out_file, - vcodec=None, - acodec=None, - log_level='info', - print_cmd=False): - """Concatenate multiple videos into a single one. - - Args: - video_list (list): A list of video filenames - out_file (str): Output video filename - vcodec (None or str): Output video codec, None for unchanged - acodec (None or str): Output audio codec, None for unchanged - log_level (str): Logging level of ffmpeg. - print_cmd (bool): Whether to print the final ffmpeg command. - """ - tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True) - with open(tmp_filename, 'w') as f: - for filename in video_list: - f.write(f'file {osp.abspath(filename)}\n') - options = {'log_level': log_level} - if vcodec is None: - options['vcodec'] = 'copy' - if acodec is None: - options['acodec'] = 'copy' - convert_video( - tmp_filename, - out_file, - print_cmd, - pre_options='-f concat -safe 0', - **options) - os.close(tmp_filehandler) - os.remove(tmp_filename) diff --git a/ControlNet/annotator/uniformer/mmcv/visualization/__init__.py b/ControlNet/annotator/uniformer/mmcv/visualization/__init__.py deleted file mode 100644 index 835df136bdcf69348281d22914d41aa84cdf92b1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/visualization/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .color import Color, color_val -from .image import imshow, imshow_bboxes, imshow_det_bboxes -from .optflow import flow2rgb, flowshow, make_color_wheel - -__all__ = [ - 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', - 'flowshow', 'flow2rgb', 'make_color_wheel' -] diff --git a/ControlNet/annotator/uniformer/mmcv/visualization/color.py b/ControlNet/annotator/uniformer/mmcv/visualization/color.py deleted file mode 100644 index 9041e0e6b7581c3356795d6a3c5e84667c88f025..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/visualization/color.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from enum import Enum - -import numpy as np - -from annotator.uniformer.mmcv.utils import is_str - - -class Color(Enum): - """An enum that defines common colors. - - Contains red, green, blue, cyan, yellow, magenta, white and black. - """ - red = (0, 0, 255) - green = (0, 255, 0) - blue = (255, 0, 0) - cyan = (255, 255, 0) - yellow = (0, 255, 255) - magenta = (255, 0, 255) - white = (255, 255, 255) - black = (0, 0, 0) - - -def color_val(color): - """Convert various input to color tuples. - - Args: - color (:obj:`Color`/str/tuple/int/ndarray): Color inputs - - Returns: - tuple[int]: A tuple of 3 integers indicating BGR channels. - """ - if is_str(color): - return Color[color].value - elif isinstance(color, Color): - return color.value - elif isinstance(color, tuple): - assert len(color) == 3 - for channel in color: - assert 0 <= channel <= 255 - return color - elif isinstance(color, int): - assert 0 <= color <= 255 - return color, color, color - elif isinstance(color, np.ndarray): - assert color.ndim == 1 and color.size == 3 - assert np.all((color >= 0) & (color <= 255)) - color = color.astype(np.uint8) - return tuple(color) - else: - raise TypeError(f'Invalid type for color: {type(color)}') diff --git a/ControlNet/annotator/uniformer/mmcv/visualization/image.py b/ControlNet/annotator/uniformer/mmcv/visualization/image.py deleted file mode 100644 index 61a56c75b67f593c298408462c63c0468be8e276..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/visualization/image.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import cv2 -import numpy as np - -from annotator.uniformer.mmcv.image import imread, imwrite -from .color import color_val - - -def imshow(img, win_name='', wait_time=0): - """Show an image. - - Args: - img (str or ndarray): The image to be displayed. - win_name (str): The window name. - wait_time (int): Value of waitKey param. - """ - cv2.imshow(win_name, imread(img)) - if wait_time == 0: # prevent from hanging if windows was closed - while True: - ret = cv2.waitKey(1) - - closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1 - # if user closed window or if some key pressed - if closed or ret != -1: - break - else: - ret = cv2.waitKey(wait_time) - - -def imshow_bboxes(img, - bboxes, - colors='green', - top_k=-1, - thickness=1, - show=True, - win_name='', - wait_time=0, - out_file=None): - """Draw bboxes on an image. - - Args: - img (str or ndarray): The image to be displayed. - bboxes (list or ndarray): A list of ndarray of shape (k, 4). - colors (list[str or tuple or Color]): A list of colors. - top_k (int): Plot the first k bboxes only if set positive. - thickness (int): Thickness of lines. - show (bool): Whether to show the image. - win_name (str): The window name. - wait_time (int): Value of waitKey param. 
- out_file (str, optional): The filename to write the image. - - Returns: - ndarray: The image with bboxes drawn on it. - """ - img = imread(img) - img = np.ascontiguousarray(img) - - if isinstance(bboxes, np.ndarray): - bboxes = [bboxes] - if not isinstance(colors, list): - colors = [colors for _ in range(len(bboxes))] - colors = [color_val(c) for c in colors] - assert len(bboxes) == len(colors) - - for i, _bboxes in enumerate(bboxes): - _bboxes = _bboxes.astype(np.int32) - if top_k <= 0: - _top_k = _bboxes.shape[0] - else: - _top_k = min(top_k, _bboxes.shape[0]) - for j in range(_top_k): - left_top = (_bboxes[j, 0], _bboxes[j, 1]) - right_bottom = (_bboxes[j, 2], _bboxes[j, 3]) - cv2.rectangle( - img, left_top, right_bottom, colors[i], thickness=thickness) - - if show: - imshow(img, win_name, wait_time) - if out_file is not None: - imwrite(img, out_file) - return img - - -def imshow_det_bboxes(img, - bboxes, - labels, - class_names=None, - score_thr=0, - bbox_color='green', - text_color='green', - thickness=1, - font_scale=0.5, - show=True, - win_name='', - wait_time=0, - out_file=None): - """Draw bboxes and class labels (with scores) on an image. - - Args: - img (str or ndarray): The image to be displayed. - bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or - (n, 5). - labels (ndarray): Labels of bboxes. - class_names (list[str]): Names of each classes. - score_thr (float): Minimum score of bboxes to be shown. - bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. - text_color (str or tuple or :obj:`Color`): Color of texts. - thickness (int): Thickness of lines. - font_scale (float): Font scales of texts. - show (bool): Whether to show the image. - win_name (str): The window name. - wait_time (int): Value of waitKey param. - out_file (str or None): The filename to write the image. - - Returns: - ndarray: The image with bboxes drawn on it. - """ - assert bboxes.ndim == 2 - assert labels.ndim == 1 - assert bboxes.shape[0] == labels.shape[0] - assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5 - img = imread(img) - img = np.ascontiguousarray(img) - - if score_thr > 0: - assert bboxes.shape[1] == 5 - scores = bboxes[:, -1] - inds = scores > score_thr - bboxes = bboxes[inds, :] - labels = labels[inds] - - bbox_color = color_val(bbox_color) - text_color = color_val(text_color) - - for bbox, label in zip(bboxes, labels): - bbox_int = bbox.astype(np.int32) - left_top = (bbox_int[0], bbox_int[1]) - right_bottom = (bbox_int[2], bbox_int[3]) - cv2.rectangle( - img, left_top, right_bottom, bbox_color, thickness=thickness) - label_text = class_names[ - label] if class_names is not None else f'cls {label}' - if len(bbox) > 4: - label_text += f'|{bbox[-1]:.02f}' - cv2.putText(img, label_text, (bbox_int[0], bbox_int[1] - 2), - cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color) - - if show: - imshow(img, win_name, wait_time) - if out_file is not None: - imwrite(img, out_file) - return img diff --git a/ControlNet/annotator/uniformer/mmcv/visualization/optflow.py b/ControlNet/annotator/uniformer/mmcv/visualization/optflow.py deleted file mode 100644 index c3870c700f7c946177ee5d536ce3f6c814a77ce7..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv/visualization/optflow.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
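Illustrative note (not part of the deleted files above): a sketch of the removed bbox visualization helper on synthetic data; with show=False and no out_file it simply returns the drawn image, so no GUI is required. The image, boxes, and class name below are made up.

    import numpy as np
    from annotator.uniformer.mmcv.visualization.image import imshow_det_bboxes

    img = np.zeros((240, 320, 3), dtype=np.uint8)
    # (n, 5) boxes: x1, y1, x2, y2, score
    bboxes = np.array([[20, 30, 120, 150, 0.9]], dtype=np.float32)
    labels = np.array([0])
    drawn = imshow_det_bboxes(
        img, bboxes, labels,
        class_names=['person'],
        score_thr=0.3,
        bbox_color='green',  # resolved to a BGR tuple via color_val()
        show=False)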
-from __future__ import division - -import numpy as np - -from annotator.uniformer.mmcv.image import rgb2bgr -from annotator.uniformer.mmcv.video import flowread -from .image import imshow - - -def flowshow(flow, win_name='', wait_time=0): - """Show optical flow. - - Args: - flow (ndarray or str): The optical flow to be displayed. - win_name (str): The window name. - wait_time (int): Value of waitKey param. - """ - flow = flowread(flow) - flow_img = flow2rgb(flow) - imshow(rgb2bgr(flow_img), win_name, wait_time) - - -def flow2rgb(flow, color_wheel=None, unknown_thr=1e6): - """Convert flow map to RGB image. - - Args: - flow (ndarray): Array of optical flow. - color_wheel (ndarray or None): Color wheel used to map flow field to - RGB colorspace. Default color wheel will be used if not specified. - unknown_thr (str): Values above this threshold will be marked as - unknown and thus ignored. - - Returns: - ndarray: RGB image that can be visualized. - """ - assert flow.ndim == 3 and flow.shape[-1] == 2 - if color_wheel is None: - color_wheel = make_color_wheel() - assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3 - num_bins = color_wheel.shape[0] - - dx = flow[:, :, 0].copy() - dy = flow[:, :, 1].copy() - - ignore_inds = ( - np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) | - (np.abs(dy) > unknown_thr)) - dx[ignore_inds] = 0 - dy[ignore_inds] = 0 - - rad = np.sqrt(dx**2 + dy**2) - if np.any(rad > np.finfo(float).eps): - max_rad = np.max(rad) - dx /= max_rad - dy /= max_rad - - rad = np.sqrt(dx**2 + dy**2) - angle = np.arctan2(-dy, -dx) / np.pi - - bin_real = (angle + 1) / 2 * (num_bins - 1) - bin_left = np.floor(bin_real).astype(int) - bin_right = (bin_left + 1) % num_bins - w = (bin_real - bin_left.astype(np.float32))[..., None] - flow_img = (1 - - w) * color_wheel[bin_left, :] + w * color_wheel[bin_right, :] - small_ind = rad <= 1 - flow_img[small_ind] = 1 - rad[small_ind, None] * (1 - flow_img[small_ind]) - flow_img[np.logical_not(small_ind)] *= 0.75 - - flow_img[ignore_inds, :] = 0 - - return flow_img - - -def make_color_wheel(bins=None): - """Build a color wheel. - - Args: - bins(list or tuple, optional): Specify the number of bins for each - color range, corresponding to six ranges: red -> yellow, - yellow -> green, green -> cyan, cyan -> blue, blue -> magenta, - magenta -> red. [15, 6, 4, 11, 13, 6] is used for default - (see Middlebury). - - Returns: - ndarray: Color wheel of shape (total_bins, 3). 
- """ - if bins is None: - bins = [15, 6, 4, 11, 13, 6] - assert len(bins) == 6 - - RY, YG, GC, CB, BM, MR = tuple(bins) - - ry = [1, np.arange(RY) / RY, 0] - yg = [1 - np.arange(YG) / YG, 1, 0] - gc = [0, 1, np.arange(GC) / GC] - cb = [0, 1 - np.arange(CB) / CB, 1] - bm = [np.arange(BM) / BM, 0, 1] - mr = [1, 0, 1 - np.arange(MR) / MR] - - num_bins = RY + YG + GC + CB + BM + MR - - color_wheel = np.zeros((3, num_bins), dtype=np.float32) - - col = 0 - for i, color in enumerate([ry, yg, gc, cb, bm, mr]): - for j in range(3): - color_wheel[j, col:col + bins[i]] = color[j] - col += bins[i] - - return color_wheel.T diff --git a/ControlNet/annotator/uniformer/mmcv_custom/__init__.py b/ControlNet/annotator/uniformer/mmcv_custom/__init__.py deleted file mode 100644 index 4b958738b9fd93bfcec239c550df1d9a44b8c536..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv_custom/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from .checkpoint import load_checkpoint - -__all__ = ['load_checkpoint'] \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/mmcv_custom/checkpoint.py b/ControlNet/annotator/uniformer/mmcv_custom/checkpoint.py deleted file mode 100644 index 19b87fef0a52d31babcdb3edb8f3089b6420173f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmcv_custom/checkpoint.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Open-MMLab. All rights reserved. -import io -import os -import os.path as osp -import pkgutil -import time -import warnings -from collections import OrderedDict -from importlib import import_module -from tempfile import TemporaryDirectory - -import torch -import torchvision -from torch.optim import Optimizer -from torch.utils import model_zoo -from torch.nn import functional as F - -import annotator.uniformer.mmcv as mmcv -from annotator.uniformer.mmcv.fileio import FileClient -from annotator.uniformer.mmcv.fileio import load as load_file -from annotator.uniformer.mmcv.parallel import is_module_wrapper -from annotator.uniformer.mmcv.utils import mkdir_or_exist -from annotator.uniformer.mmcv.runner import get_dist_info - -ENV_MMCV_HOME = 'MMCV_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - - -def _get_mmcv_home(): - mmcv_home = os.path.expanduser( - os.getenv( - ENV_MMCV_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) - - mkdir_or_exist(mmcv_home) - return mmcv_home - - -def load_state_dict(module, state_dict, strict=False, logger=None): - """Load state_dict to a module. - - This method is modified from :meth:`torch.nn.Module.load_state_dict`. - Default value for ``strict`` is set to ``False`` and the message for - param mismatch will be shown even if strict is False. - - Args: - module (Module): Module that receives the state_dict. - state_dict (OrderedDict): Weights. - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. - logger (:obj:`logging.Logger`, optional): Logger to log the error - message. If not specified, print function will be used. 
- """ - unexpected_keys = [] - all_missing_keys = [] - err_msg = [] - - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # use _load_from_state_dict to enable checkpoint version control - def load(module, prefix=''): - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, True, - all_missing_keys, unexpected_keys, - err_msg) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(module) - load = None # break load->load reference cycle - - # ignore "num_batches_tracked" of BN layers - missing_keys = [ - key for key in all_missing_keys if 'num_batches_tracked' not in key - ] - - if unexpected_keys: - err_msg.append('unexpected key in source ' - f'state_dict: {", ".join(unexpected_keys)}\n') - if missing_keys: - err_msg.append( - f'missing keys in source state_dict: {", ".join(missing_keys)}\n') - - rank, _ = get_dist_info() - if len(err_msg) > 0 and rank == 0: - err_msg.insert( - 0, 'The model and loaded state dict do not match exactly\n') - err_msg = '\n'.join(err_msg) - if strict: - raise RuntimeError(err_msg) - elif logger is not None: - logger.warning(err_msg) - else: - print(err_msg) - - -def load_url_dist(url, model_dir=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - return checkpoint - - -def load_pavimodel_dist(model_path, map_location=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - try: - from pavi import modelcloud - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load(downloaded_file, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load( - downloaded_file, map_location=map_location) - return checkpoint - - -def load_fileclient_dist(filename, backend, map_location): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - allowed_backends = ['ceph'] - if backend not in allowed_backends: - raise ValueError(f'Load from Backend {backend} is not supported.') - if rank == 0: - fileclient = FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - fileclient = 
FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - return checkpoint - - -def get_torchvision_models(): - model_urls = dict() - for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): - if ispkg: - continue - _zoo = import_module(f'torchvision.models.{name}') - if hasattr(_zoo, 'model_urls'): - _urls = getattr(_zoo, 'model_urls') - model_urls.update(_urls) - return model_urls - - -def get_external_models(): - mmcv_home = _get_mmcv_home() - default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') - default_urls = load_file(default_json_path) - assert isinstance(default_urls, dict) - external_json_path = osp.join(mmcv_home, 'open_mmlab.json') - if osp.exists(external_json_path): - external_urls = load_file(external_json_path) - assert isinstance(external_urls, dict) - default_urls.update(external_urls) - - return default_urls - - -def get_mmcls_models(): - mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') - mmcls_urls = load_file(mmcls_json_path) - - return mmcls_urls - - -def get_deprecated_model_names(): - deprecate_json_path = osp.join(mmcv.__path__[0], - 'model_zoo/deprecated.json') - deprecate_urls = load_file(deprecate_json_path) - assert isinstance(deprecate_urls, dict) - - return deprecate_urls - - -def _process_mmcls_checkpoint(checkpoint): - state_dict = checkpoint['state_dict'] - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - if k.startswith('backbone.'): - new_state_dict[k[9:]] = v - new_checkpoint = dict(state_dict=new_state_dict) - - return new_checkpoint - - -def _load_checkpoint(filename, map_location=None): - """Load checkpoint from somewhere (modelzoo, file, url). - - Args: - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str | None): Same as :func:`torch.load`. Default: None. - - Returns: - dict | OrderedDict: The loaded checkpoint. It can be either an - OrderedDict storing model weights or a dict containing other - information, which depends on the checkpoint. 
- """ - if filename.startswith('modelzoo://'): - warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' - 'use "torchvision://" instead') - model_urls = get_torchvision_models() - model_name = filename[11:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('torchvision://'): - model_urls = get_torchvision_models() - model_name = filename[14:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('open-mmlab://'): - model_urls = get_external_models() - model_name = filename[13:] - deprecated_urls = get_deprecated_model_names() - if model_name in deprecated_urls: - warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' - f'of open-mmlab://{deprecated_urls[model_name]}') - model_name = deprecated_urls[model_name] - model_url = model_urls[model_name] - # check if is url - if model_url.startswith(('http://', 'https://')): - checkpoint = load_url_dist(model_url) - else: - filename = osp.join(_get_mmcv_home(), model_url) - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - elif filename.startswith('mmcls://'): - model_urls = get_mmcls_models() - model_name = filename[8:] - checkpoint = load_url_dist(model_urls[model_name]) - checkpoint = _process_mmcls_checkpoint(checkpoint) - elif filename.startswith(('http://', 'https://')): - checkpoint = load_url_dist(filename) - elif filename.startswith('pavi://'): - model_path = filename[7:] - checkpoint = load_pavimodel_dist(model_path, map_location=map_location) - elif filename.startswith('s3://'): - checkpoint = load_fileclient_dist( - filename, backend='ceph', map_location=map_location) - else: - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -def load_checkpoint(model, - filename, - map_location='cpu', - strict=False, - logger=None): - """Load checkpoint from a file or URI. - - Args: - model (Module): Module to load checkpoint. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str): Same as :func:`torch.load`. - strict (bool): Whether to allow different params for the model and - checkpoint. - logger (:mod:`logging.Logger` or None): The logger for error message. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
- """ - checkpoint = _load_checkpoint(filename, map_location) - # OrderedDict is a subclass of dict - if not isinstance(checkpoint, dict): - raise RuntimeError( - f'No state_dict found in checkpoint file {filename}') - # get state_dict from checkpoint - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - # strip prefix of state_dict - if list(state_dict.keys())[0].startswith('module.'): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - # for MoBY, load model of online branch - if sorted(list(state_dict.keys()))[0].startswith('encoder'): - state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} - - # reshape absolute position embedding - if state_dict.get('absolute_pos_embed') is not None: - absolute_pos_embed = state_dict['absolute_pos_embed'] - N1, L, C1 = absolute_pos_embed.size() - N2, C2, H, W = model.absolute_pos_embed.size() - if N1 != N2 or C1 != C2 or L != H*W: - logger.warning("Error in loading absolute_pos_embed, pass") - else: - state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2) - - # interpolate position bias table if needed - relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] - for table_key in relative_position_bias_table_keys: - table_pretrained = state_dict[table_key] - table_current = model.state_dict()[table_key] - L1, nH1 = table_pretrained.size() - L2, nH2 = table_current.size() - if nH1 != nH2: - logger.warning(f"Error in loading {table_key}, pass") - else: - if L1 != L2: - S1 = int(L1 ** 0.5) - S2 = int(L2 ** 0.5) - table_pretrained_resized = F.interpolate( - table_pretrained.permute(1, 0).view(1, nH1, S1, S1), - size=(S2, S2), mode='bicubic') - state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0) - - # load state_dict - load_state_dict(model, state_dict, strict, logger) - return checkpoint - - -def weights_to_cpu(state_dict): - """Copy a model state_dict to cpu. - - Args: - state_dict (OrderedDict): Model weights on GPU. - - Returns: - OrderedDict: Model weights on GPU. - """ - state_dict_cpu = OrderedDict() - for key, val in state_dict.items(): - state_dict_cpu[key] = val.cpu() - return state_dict_cpu - - -def _save_to_state_dict(module, destination, prefix, keep_vars): - """Saves module state to `destination` dictionary. - - This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. - - Args: - module (nn.Module): The module to generate state_dict. - destination (dict): A dict where state will be stored. - prefix (str): The prefix for parameters and buffers used in this - module. - """ - for name, param in module._parameters.items(): - if param is not None: - destination[prefix + name] = param if keep_vars else param.detach() - for name, buf in module._buffers.items(): - # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d - if buf is not None: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def get_state_dict(module, destination=None, prefix='', keep_vars=False): - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. 
- - This method is modified from :meth:`torch.nn.Module.state_dict` to - recursively check parallel module in case that the model has a complicated - structure, e.g., nn.Module(nn.Module(DDP)). - - Args: - module (nn.Module): The module to generate state_dict. - destination (OrderedDict): Returned dict for the state of the - module. - prefix (str): Prefix of the key. - keep_vars (bool): Whether to keep the variable property of the - parameters. Default: False. - - Returns: - dict: A dictionary containing a whole state of the module. - """ - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - - # below is the same as torch.nn.Module.state_dict() - if destination is None: - destination = OrderedDict() - destination._metadata = OrderedDict() - destination._metadata[prefix[:-1]] = local_metadata = dict( - version=module._version) - _save_to_state_dict(module, destination, prefix, keep_vars) - for name, child in module._modules.items(): - if child is not None: - get_state_dict( - child, destination, prefix + name + '.', keep_vars=keep_vars) - for hook in module._state_dict_hooks.values(): - hook_result = hook(module, destination, prefix, local_metadata) - if hook_result is not None: - destination = hook_result - return destination - - -def save_checkpoint(model, filename, optimizer=None, meta=None): - """Save checkpoint to file. - - The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. - - Args: - model (Module): Module whose params are to be saved. - filename (str): Checkpoint filename. - optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. - meta (dict, optional): Metadata to be saved in checkpoint. 
- """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError(f'meta must be a dict or None, but got {type(meta)}') - meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - - if is_module_wrapper(model): - model = model.module - - if hasattr(model, 'CLASSES') and model.CLASSES is not None: - # save class name to the meta - meta.update(CLASSES=model.CLASSES) - - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(get_state_dict(model)) - } - # save optimizer state dict in the checkpoint - if isinstance(optimizer, Optimizer): - checkpoint['optimizer'] = optimizer.state_dict() - elif isinstance(optimizer, dict): - checkpoint['optimizer'] = {} - for name, optim in optimizer.items(): - checkpoint['optimizer'][name] = optim.state_dict() - - if filename.startswith('pavi://'): - try: - from pavi import modelcloud - from pavi.exception import NodeNotFoundError - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - model_path = filename[7:] - root = modelcloud.Folder() - model_dir, model_name = osp.split(model_path) - try: - model = modelcloud.get(model_dir) - except NodeNotFoundError: - model = root.create_training_model(model_dir) - with TemporaryDirectory() as tmp_dir: - checkpoint_file = osp.join(tmp_dir, model_name) - with open(checkpoint_file, 'wb') as f: - torch.save(checkpoint, f) - f.flush() - model.create_file(checkpoint_file, name=model_name) - else: - mmcv.mkdir_or_exist(osp.dirname(filename)) - # immediately flush buffer - with open(filename, 'wb') as f: - torch.save(checkpoint, f) - f.flush() \ No newline at end of file diff --git a/ControlNet/annotator/uniformer/mmseg/apis/__init__.py b/ControlNet/annotator/uniformer/mmseg/apis/__init__.py deleted file mode 100644 index 170724be38de42daf2bc1a1910e181d68818f165..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/apis/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .inference import inference_segmentor, init_segmentor, show_result_pyplot -from .test import multi_gpu_test, single_gpu_test -from .train import get_root_logger, set_random_seed, train_segmentor - -__all__ = [ - 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', - 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', - 'show_result_pyplot' -] diff --git a/ControlNet/annotator/uniformer/mmseg/apis/inference.py b/ControlNet/annotator/uniformer/mmseg/apis/inference.py deleted file mode 100644 index 90bc1c0c68525734bd6793f07c15fe97d3c8342c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/apis/inference.py +++ /dev/null @@ -1,136 +0,0 @@ -import matplotlib.pyplot as plt -import annotator.uniformer.mmcv as mmcv -import torch -from annotator.uniformer.mmcv.parallel import collate, scatter -from annotator.uniformer.mmcv.runner import load_checkpoint - -from annotator.uniformer.mmseg.datasets.pipelines import Compose -from annotator.uniformer.mmseg.models import build_segmentor - - -def init_segmentor(config, checkpoint=None, device='cuda:0'): - """Initialize a segmentor from config file. - - Args: - config (str or :obj:`mmcv.Config`): Config file path or the config - object. - checkpoint (str, optional): Checkpoint path. If left as None, the model - will not load any weights. - device (str, optional) CPU/CUDA device option. Default 'cuda:0'. - Use 'cpu' for loading model on CPU. - Returns: - nn.Module: The constructed segmentor. 
- """ - if isinstance(config, str): - config = mmcv.Config.fromfile(config) - elif not isinstance(config, mmcv.Config): - raise TypeError('config must be a filename or Config object, ' - 'but got {}'.format(type(config))) - config.model.pretrained = None - config.model.train_cfg = None - model = build_segmentor(config.model, test_cfg=config.get('test_cfg')) - if checkpoint is not None: - checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') - model.CLASSES = checkpoint['meta']['CLASSES'] - model.PALETTE = checkpoint['meta']['PALETTE'] - model.cfg = config # save the config in the model for convenience - model.to(device) - model.eval() - return model - - -class LoadImage: - """A simple pipeline to load image.""" - - def __call__(self, results): - """Call function to load images into results. - - Args: - results (dict): A result dict contains the file name - of the image to be read. - - Returns: - dict: ``results`` will be returned containing loaded image. - """ - - if isinstance(results['img'], str): - results['filename'] = results['img'] - results['ori_filename'] = results['img'] - else: - results['filename'] = None - results['ori_filename'] = None - img = mmcv.imread(results['img']) - results['img'] = img - results['img_shape'] = img.shape - results['ori_shape'] = img.shape - return results - - -def inference_segmentor(model, img): - """Inference image(s) with the segmentor. - - Args: - model (nn.Module): The loaded segmentor. - imgs (str/ndarray or list[str/ndarray]): Either image files or loaded - images. - - Returns: - (list[Tensor]): The segmentation result. - """ - cfg = model.cfg - device = next(model.parameters()).device # model device - # build the data pipeline - test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] - test_pipeline = Compose(test_pipeline) - # prepare data - data = dict(img=img) - data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: - data['img_metas'] = [i.data[0] for i in data['img_metas']] - - # forward the model - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result - - -def show_result_pyplot(model, - img, - result, - palette=None, - fig_size=(15, 10), - opacity=0.5, - title='', - block=True): - """Visualize the segmentation results on the image. - - Args: - model (nn.Module): The loaded segmentor. - img (str or np.ndarray): Image filename or loaded image. - result (list): The segmentation result. - palette (list[list[int]]] | None): The palette of segmentation - map. If None is given, random palette will be generated. - Default: None - fig_size (tuple): Figure size of the pyplot figure. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. - title (str): The title of pyplot figure. - Default is ''. - block (bool): Whether to block the pyplot figure. - Default is True. 
- """ - if hasattr(model, 'module'): - model = model.module - img = model.show_result( - img, result, palette=palette, show=False, opacity=opacity) - # plt.figure(figsize=fig_size) - # plt.imshow(mmcv.bgr2rgb(img)) - # plt.title(title) - # plt.tight_layout() - # plt.show(block=block) - return mmcv.bgr2rgb(img) diff --git a/ControlNet/annotator/uniformer/mmseg/apis/test.py b/ControlNet/annotator/uniformer/mmseg/apis/test.py deleted file mode 100644 index e574eb7da04f09a59cf99ff953c36468ae87a326..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/apis/test.py +++ /dev/null @@ -1,238 +0,0 @@ -import os.path as osp -import pickle -import shutil -import tempfile - -import annotator.uniformer.mmcv as mmcv -import numpy as np -import torch -import torch.distributed as dist -from annotator.uniformer.mmcv.image import tensor2imgs -from annotator.uniformer.mmcv.runner import get_dist_info - - -def np2tmp(array, temp_file_name=None): - """Save ndarray to local numpy file. - - Args: - array (ndarray): Ndarray to save. - temp_file_name (str): Numpy file name. If 'temp_file_name=None', this - function will generate a file name with tempfile.NamedTemporaryFile - to save ndarray. Default: None. - - Returns: - str: The numpy file name. - """ - - if temp_file_name is None: - temp_file_name = tempfile.NamedTemporaryFile( - suffix='.npy', delete=False).name - np.save(temp_file_name, array) - return temp_file_name - - -def single_gpu_test(model, - data_loader, - show=False, - out_dir=None, - efficient_test=False, - opacity=0.5): - """Test with single GPU. - - Args: - model (nn.Module): Model to be tested. - data_loader (utils.data.Dataloader): Pytorch data loader. - show (bool): Whether show results during inference. Default: False. - out_dir (str, optional): If specified, the results will be dumped into - the directory to save output results. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. - Returns: - list: The prediction results. - """ - - model.eval() - results = [] - dataset = data_loader.dataset - prog_bar = mmcv.ProgressBar(len(dataset)) - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, **data) - - if show or out_dir: - img_tensor = data['img'][0] - img_metas = data['img_metas'][0].data[0] - imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) - assert len(imgs) == len(img_metas) - - for img, img_meta in zip(imgs, img_metas): - h, w, _ = img_meta['img_shape'] - img_show = img[:h, :w, :] - - ori_h, ori_w = img_meta['ori_shape'][:-1] - img_show = mmcv.imresize(img_show, (ori_w, ori_h)) - - if out_dir: - out_file = osp.join(out_dir, img_meta['ori_filename']) - else: - out_file = None - - model.module.show_result( - img_show, - result, - palette=dataset.PALETTE, - show=show, - out_file=out_file, - opacity=opacity) - - if isinstance(result, list): - if efficient_test: - result = [np2tmp(_) for _ in result] - results.extend(result) - else: - if efficient_test: - result = np2tmp(result) - results.append(result) - - batch_size = len(result) - for _ in range(batch_size): - prog_bar.update() - return results - - -def multi_gpu_test(model, - data_loader, - tmpdir=None, - gpu_collect=False, - efficient_test=False): - """Test model with multiple gpus. 
- - This method tests model with multiple gpus and collects the results - under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' - it encodes results to gpu tensors and use gpu communication for results - collection. On cpu mode it saves the results on different gpus to 'tmpdir' - and collects them by the rank 0 worker. - - Args: - model (nn.Module): Model to be tested. - data_loader (utils.data.Dataloader): Pytorch data loader. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. - gpu_collect (bool): Option to use either gpu or cpu to collect results. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. - - Returns: - list: The prediction results. - """ - - model.eval() - results = [] - dataset = data_loader.dataset - rank, world_size = get_dist_info() - if rank == 0: - prog_bar = mmcv.ProgressBar(len(dataset)) - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - - if isinstance(result, list): - if efficient_test: - result = [np2tmp(_) for _ in result] - results.extend(result) - else: - if efficient_test: - result = np2tmp(result) - results.append(result) - - if rank == 0: - batch_size = data['img'][0].size(0) - for _ in range(batch_size * world_size): - prog_bar.update() - - # collect results from all ranks - if gpu_collect: - results = collect_results_gpu(results, len(dataset)) - else: - results = collect_results_cpu(results, len(dataset), tmpdir) - return results - - -def collect_results_cpu(result_part, size, tmpdir=None): - """Collect results with CPU.""" - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - tmpdir = tempfile.mkdtemp() - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) - dist.barrier() - # collect all parts - if rank != 0: - return None - else: - # load results of all parts from tmp dir - part_list = [] - for i in range(world_size): - part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) - part_list.append(mmcv.load(part_file)) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - # remove tmp dir - shutil.rmtree(tmpdir) - return ordered_results - - -def collect_results_gpu(result_part, size): - """Collect results with GPU.""" - rank, world_size = get_dist_info() - # dump result part to tensor with pickle - part_tensor = torch.tensor( - bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') - # gather all result part tensor shape - shape_tensor = torch.tensor(part_tensor.shape, device='cuda') - shape_list = [shape_tensor.clone() for _ in range(world_size)] - dist.all_gather(shape_list, shape_tensor) - # padding result part tensor to max length - shape_max = torch.tensor(shape_list).max() - part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') - part_send[:shape_tensor[0]] = part_tensor - 
part_recv_list = [ - part_tensor.new_zeros(shape_max) for _ in range(world_size) - ] - # gather all result part - dist.all_gather(part_recv_list, part_send) - - if rank == 0: - part_list = [] - for recv, shape in zip(part_recv_list, shape_list): - part_list.append( - pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - return ordered_results diff --git a/ControlNet/annotator/uniformer/mmseg/apis/train.py b/ControlNet/annotator/uniformer/mmseg/apis/train.py deleted file mode 100644 index 63f319a919ff023931a6a663e668f27dd1a07a2e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/apis/train.py +++ /dev/null @@ -1,116 +0,0 @@ -import random -import warnings - -import numpy as np -import torch -from annotator.uniformer.mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from annotator.uniformer.mmcv.runner import build_optimizer, build_runner - -from annotator.uniformer.mmseg.core import DistEvalHook, EvalHook -from annotator.uniformer.mmseg.datasets import build_dataloader, build_dataset -from annotator.uniformer.mmseg.utils import get_root_logger - - -def set_random_seed(seed, deterministic=False): - """Set random seed. - - Args: - seed (int): Seed to be used. - deterministic (bool): Whether to set the deterministic option for - CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` - to True and `torch.backends.cudnn.benchmark` to False. - Default: False. - """ - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - if deterministic: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - -def train_segmentor(model, - dataset, - cfg, - distributed=False, - validate=False, - timestamp=None, - meta=None): - """Launch segmentor training.""" - logger = get_root_logger(cfg.log_level) - - # prepare data loaders - dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - data_loaders = [ - build_dataloader( - ds, - cfg.data.samples_per_gpu, - cfg.data.workers_per_gpu, - # cfg.gpus will be ignored if distributed - len(cfg.gpu_ids), - dist=distributed, - seed=cfg.seed, - drop_last=True) for ds in dataset - ] - - # put model on gpus - if distributed: - find_unused_parameters = cfg.get('find_unused_parameters', False) - # Sets the `find_unused_parameters` parameter in - # torch.nn.parallel.DistributedDataParallel - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False, - find_unused_parameters=find_unused_parameters) - else: - model = MMDataParallel( - model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) - - # build runner - optimizer = build_optimizer(model, cfg.optimizer) - - if cfg.get('runner') is None: - cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} - warnings.warn( - 'config is now expected to have a `runner` section, ' - 'please set `runner` in your config.', UserWarning) - - runner = build_runner( - cfg.runner, - default_args=dict( - model=model, - batch_processor=None, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta)) - - # register hooks - runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, - cfg.checkpoint_config, cfg.log_config, - cfg.get('momentum_config', None)) - - # an ugly walkaround to make the .log and 
.log.json filenames the same - runner.timestamp = timestamp - - # register eval hooks - if validate: - val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) - val_dataloader = build_dataloader( - val_dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - eval_cfg = cfg.get('evaluation', {}) - eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' - eval_hook = DistEvalHook if distributed else EvalHook - runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') - - if cfg.resume_from: - runner.resume(cfg.resume_from) - elif cfg.load_from: - runner.load_checkpoint(cfg.load_from) - runner.run(data_loaders, cfg.workflow) diff --git a/ControlNet/annotator/uniformer/mmseg/core/__init__.py b/ControlNet/annotator/uniformer/mmseg/core/__init__.py deleted file mode 100644 index 965605587211b7bf0bd6bc3acdbb33dd49cab023..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .evaluation import * # noqa: F401, F403 -from .seg import * # noqa: F401, F403 -from .utils import * # noqa: F401, F403 diff --git a/ControlNet/annotator/uniformer/mmseg/core/evaluation/__init__.py b/ControlNet/annotator/uniformer/mmseg/core/evaluation/__init__.py deleted file mode 100644 index f7cc4b23413a0639e9de00eeb0bf600632d2c6cd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/evaluation/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .class_names import get_classes, get_palette -from .eval_hooks import DistEvalHook, EvalHook -from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou - -__all__ = [ - 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', - 'eval_metrics', 'get_classes', 'get_palette' -] diff --git a/ControlNet/annotator/uniformer/mmseg/core/evaluation/class_names.py b/ControlNet/annotator/uniformer/mmseg/core/evaluation/class_names.py deleted file mode 100644 index ffae816cf980ce4b03e491cc0c4298cb823797e6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/evaluation/class_names.py +++ /dev/null @@ -1,152 +0,0 @@ -import annotator.uniformer.mmcv as mmcv - - -def cityscapes_classes(): - """Cityscapes class names for external use.""" - return [ - 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', - 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', - 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', - 'bicycle' - ] - - -def ade_classes(): - """ADE20K class names for external use.""" - return [ - 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', - 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', - 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', - 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', - 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', - 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', - 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', - 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', - 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', - 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', - 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', - 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', - 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', - 'chandelier', 'awning', 
'streetlight', 'booth', 'television receiver', - 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', - 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', - 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', - 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', - 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', - 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', - 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', - 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', - 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', - 'clock', 'flag' - ] - - -def voc_classes(): - """Pascal VOC class names for external use.""" - return [ - 'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', - 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', - 'tvmonitor' - ] - - -def cityscapes_palette(): - """Cityscapes palette for external use.""" - return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], - [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], - [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], - [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], - [0, 0, 230], [119, 11, 32]] - - -def ade_palette(): - """ADE20K palette for external use.""" - return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], - [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], - [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], - [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], - [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], - [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], - [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], - [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], - [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], - [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], - [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], - [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], - [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], - [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], - [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], - [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], - [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], - [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], - [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], - [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], - [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], - [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], - [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], - [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], - [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], - [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], - [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], - [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], - [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], - [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], - [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], - [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], - [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], - [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], - [41, 0, 
255], [41, 255, 0], [173, 0, 255], [0, 245, 255], - [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], - [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], - [102, 255, 0], [92, 0, 255]] - - -def voc_palette(): - """Pascal VOC palette for external use.""" - return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], - [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], - [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], - [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], - [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] - - -dataset_aliases = { - 'cityscapes': ['cityscapes'], - 'ade': ['ade', 'ade20k'], - 'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'] -} - - -def get_classes(dataset): - """Get class names of a dataset.""" - alias2name = {} - for name, aliases in dataset_aliases.items(): - for alias in aliases: - alias2name[alias] = name - - if mmcv.is_str(dataset): - if dataset in alias2name: - labels = eval(alias2name[dataset] + '_classes()') - else: - raise ValueError(f'Unrecognized dataset: {dataset}') - else: - raise TypeError(f'dataset must a str, but got {type(dataset)}') - return labels - - -def get_palette(dataset): - """Get class palette (RGB) of a dataset.""" - alias2name = {} - for name, aliases in dataset_aliases.items(): - for alias in aliases: - alias2name[alias] = name - - if mmcv.is_str(dataset): - if dataset in alias2name: - labels = eval(alias2name[dataset] + '_palette()') - else: - raise ValueError(f'Unrecognized dataset: {dataset}') - else: - raise TypeError(f'dataset must a str, but got {type(dataset)}') - return labels diff --git a/ControlNet/annotator/uniformer/mmseg/core/evaluation/eval_hooks.py b/ControlNet/annotator/uniformer/mmseg/core/evaluation/eval_hooks.py deleted file mode 100644 index 6fc100c8f96e817a6ed2666f7c9f762af2463b48..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/evaluation/eval_hooks.py +++ /dev/null @@ -1,109 +0,0 @@ -import os.path as osp - -from annotator.uniformer.mmcv.runner import DistEvalHook as _DistEvalHook -from annotator.uniformer.mmcv.runner import EvalHook as _EvalHook - - -class EvalHook(_EvalHook): - """Single GPU EvalHook, with efficient test support. - - Args: - by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - Default: False. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. - Returns: - list: The prediction results. - """ - - greater_keys = ['mIoU', 'mAcc', 'aAcc'] - - def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): - super().__init__(*args, by_epoch=by_epoch, **kwargs) - self.efficient_test = efficient_test - - def after_train_iter(self, runner): - """After train epoch hook. - - Override default ``single_gpu_test``. - """ - if self.by_epoch or not self.every_n_iters(runner, self.interval): - return - from annotator.uniformer.mmseg.apis import single_gpu_test - runner.log_buffer.clear() - results = single_gpu_test( - runner.model, - self.dataloader, - show=False, - efficient_test=self.efficient_test) - self.evaluate(runner, results) - - def after_train_epoch(self, runner): - """After train epoch hook. - - Override default ``single_gpu_test``. 
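Illustrative note (not part of the deleted files above): a sketch of the removed class-name and palette lookup helpers; dataset aliases such as 'ade20k' resolve to 'ade'.

    from annotator.uniformer.mmseg.core.evaluation import get_classes, get_palette

    classes = get_classes('cityscapes')   # 19 Cityscapes class names
    palette = get_palette('ade20k')       # 150 RGB triplets for ADE20K
    assert len(classes) == 19
    assert palette[0] == [120, 120, 120]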
- """ - if not self.by_epoch or not self.every_n_epochs(runner, self.interval): - return - from annotator.uniformer.mmseg.apis import single_gpu_test - runner.log_buffer.clear() - results = single_gpu_test(runner.model, self.dataloader, show=False) - self.evaluate(runner, results) - - -class DistEvalHook(_DistEvalHook): - """Distributed EvalHook, with efficient test support. - - Args: - by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - Default: False. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. - Returns: - list: The prediction results. - """ - - greater_keys = ['mIoU', 'mAcc', 'aAcc'] - - def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): - super().__init__(*args, by_epoch=by_epoch, **kwargs) - self.efficient_test = efficient_test - - def after_train_iter(self, runner): - """After train epoch hook. - - Override default ``multi_gpu_test``. - """ - if self.by_epoch or not self.every_n_iters(runner, self.interval): - return - from annotator.uniformer.mmseg.apis import multi_gpu_test - runner.log_buffer.clear() - results = multi_gpu_test( - runner.model, - self.dataloader, - tmpdir=osp.join(runner.work_dir, '.eval_hook'), - gpu_collect=self.gpu_collect, - efficient_test=self.efficient_test) - if runner.rank == 0: - print('\n') - self.evaluate(runner, results) - - def after_train_epoch(self, runner): - """After train epoch hook. - - Override default ``multi_gpu_test``. - """ - if not self.by_epoch or not self.every_n_epochs(runner, self.interval): - return - from annotator.uniformer.mmseg.apis import multi_gpu_test - runner.log_buffer.clear() - results = multi_gpu_test( - runner.model, - self.dataloader, - tmpdir=osp.join(runner.work_dir, '.eval_hook'), - gpu_collect=self.gpu_collect) - if runner.rank == 0: - print('\n') - self.evaluate(runner, results) diff --git a/ControlNet/annotator/uniformer/mmseg/core/evaluation/metrics.py b/ControlNet/annotator/uniformer/mmseg/core/evaluation/metrics.py deleted file mode 100644 index 16c7dd47cadd53cf1caaa194e28a343f2aacc599..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/evaluation/metrics.py +++ /dev/null @@ -1,326 +0,0 @@ -from collections import OrderedDict - -import annotator.uniformer.mmcv as mmcv -import numpy as np -import torch - - -def f_score(precision, recall, beta=1): - """calcuate the f-score value. - - Args: - precision (float | torch.Tensor): The precision value. - recall (float | torch.Tensor): The recall value. - beta (int): Determines the weight of recall in the combined score. - Default: False. - - Returns: - [torch.tensor]: The f-score value. - """ - score = (1 + beta**2) * (precision * recall) / ( - (beta**2 * precision) + recall) - return score - - -def intersect_and_union(pred_label, - label, - num_classes, - ignore_index, - label_map=dict(), - reduce_zero_label=False): - """Calculate intersection and Union. - - Args: - pred_label (ndarray | str): Prediction segmentation map - or predict result filename. - label (ndarray | str): Ground truth segmentation map - or label filename. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - label_map (dict): Mapping old labels to new labels. The parameter will - work only when label is str. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. 
The parameter will - work only when label is str. Default: False. - - Returns: - torch.Tensor: The intersection of prediction and ground truth - histogram on all classes. - torch.Tensor: The union of prediction and ground truth histogram on - all classes. - torch.Tensor: The prediction histogram on all classes. - torch.Tensor: The ground truth histogram on all classes. - """ - - if isinstance(pred_label, str): - pred_label = torch.from_numpy(np.load(pred_label)) - else: - pred_label = torch.from_numpy((pred_label)) - - if isinstance(label, str): - label = torch.from_numpy( - mmcv.imread(label, flag='unchanged', backend='pillow')) - else: - label = torch.from_numpy(label) - - if label_map is not None: - for old_id, new_id in label_map.items(): - label[label == old_id] = new_id - if reduce_zero_label: - label[label == 0] = 255 - label = label - 1 - label[label == 254] = 255 - - mask = (label != ignore_index) - pred_label = pred_label[mask] - label = label[mask] - - intersect = pred_label[pred_label == label] - area_intersect = torch.histc( - intersect.float(), bins=(num_classes), min=0, max=num_classes - 1) - area_pred_label = torch.histc( - pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1) - area_label = torch.histc( - label.float(), bins=(num_classes), min=0, max=num_classes - 1) - area_union = area_pred_label + area_label - area_intersect - return area_intersect, area_union, area_pred_label, area_label - - -def total_intersect_and_union(results, - gt_seg_maps, - num_classes, - ignore_index, - label_map=dict(), - reduce_zero_label=False): - """Calculate Total Intersection and Union. - - Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. - gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. - - Returns: - ndarray: The intersection of prediction and ground truth histogram - on all classes. - ndarray: The union of prediction and ground truth histogram on all - classes. - ndarray: The prediction histogram on all classes. - ndarray: The ground truth histogram on all classes. - """ - num_imgs = len(results) - assert len(gt_seg_maps) == num_imgs - total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) - for i in range(num_imgs): - area_intersect, area_union, area_pred_label, area_label = \ - intersect_and_union( - results[i], gt_seg_maps[i], num_classes, ignore_index, - label_map, reduce_zero_label) - total_area_intersect += area_intersect - total_area_union += area_union - total_area_pred_label += area_pred_label - total_area_label += area_label - return total_area_intersect, total_area_union, total_area_pred_label, \ - total_area_label - - -def mean_iou(results, - gt_seg_maps, - num_classes, - ignore_index, - nan_to_num=None, - label_map=dict(), - reduce_zero_label=False): - """Calculate Mean Intersection and Union (mIoU) - - Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. 
- gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - nan_to_num (int, optional): If specified, NaN values will be replaced - by the numbers defined by the user. Default: None. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. - - Returns: - dict[str, float | ndarray]: - float: Overall accuracy on all images. - ndarray: Per category accuracy, shape (num_classes, ). - ndarray: Per category IoU, shape (num_classes, ). - """ - iou_result = eval_metrics( - results=results, - gt_seg_maps=gt_seg_maps, - num_classes=num_classes, - ignore_index=ignore_index, - metrics=['mIoU'], - nan_to_num=nan_to_num, - label_map=label_map, - reduce_zero_label=reduce_zero_label) - return iou_result - - -def mean_dice(results, - gt_seg_maps, - num_classes, - ignore_index, - nan_to_num=None, - label_map=dict(), - reduce_zero_label=False): - """Calculate Mean Dice (mDice) - - Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. - gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - nan_to_num (int, optional): If specified, NaN values will be replaced - by the numbers defined by the user. Default: None. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. - - Returns: - dict[str, float | ndarray]: Default metrics. - float: Overall accuracy on all images. - ndarray: Per category accuracy, shape (num_classes, ). - ndarray: Per category dice, shape (num_classes, ). - """ - - dice_result = eval_metrics( - results=results, - gt_seg_maps=gt_seg_maps, - num_classes=num_classes, - ignore_index=ignore_index, - metrics=['mDice'], - nan_to_num=nan_to_num, - label_map=label_map, - reduce_zero_label=reduce_zero_label) - return dice_result - - -def mean_fscore(results, - gt_seg_maps, - num_classes, - ignore_index, - nan_to_num=None, - label_map=dict(), - reduce_zero_label=False, - beta=1): - """Calculate Mean Intersection and Union (mIoU) - - Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. - gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - nan_to_num (int, optional): If specified, NaN values will be replaced - by the numbers defined by the user. Default: None. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. - beta (int): Determines the weight of recall in the combined score. - Default: False. - - - Returns: - dict[str, float | ndarray]: Default metrics. - float: Overall accuracy on all images. - ndarray: Per category recall, shape (num_classes, ). - ndarray: Per category precision, shape (num_classes, ). - ndarray: Per category f-score, shape (num_classes, ). 
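# Editor's aside (a minimal sketch, not part of the original diff): mean_iou,
# mean_dice and mean_fscore above all reduce to the four per-class histograms
# produced by total_intersect_and_union. Assuming those are float tensors with
# non-zero denominators, the per-class scores follow directly, mirroring the
# formulas in eval_metrics below (the helper name is hypothetical):
def scores_from_histograms(intersect, union, pred, label, beta=1):
    # intersect, union, pred, label: per-class area histograms, shape (num_classes,)
    iou = intersect / union
    dice = 2 * intersect / (pred + label)
    precision = intersect / pred
    recall = intersect / label
    fscore = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    return iou, dice, fscore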
- """ - fscore_result = eval_metrics( - results=results, - gt_seg_maps=gt_seg_maps, - num_classes=num_classes, - ignore_index=ignore_index, - metrics=['mFscore'], - nan_to_num=nan_to_num, - label_map=label_map, - reduce_zero_label=reduce_zero_label, - beta=beta) - return fscore_result - - -def eval_metrics(results, - gt_seg_maps, - num_classes, - ignore_index, - metrics=['mIoU'], - nan_to_num=None, - label_map=dict(), - reduce_zero_label=False, - beta=1): - """Calculate evaluation metrics - Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. - gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. - metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. - nan_to_num (int, optional): If specified, NaN values will be replaced - by the numbers defined by the user. Default: None. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. - Returns: - float: Overall accuracy on all images. - ndarray: Per category accuracy, shape (num_classes, ). - ndarray: Per category evaluation metrics, shape (num_classes, ). - """ - if isinstance(metrics, str): - metrics = [metrics] - allowed_metrics = ['mIoU', 'mDice', 'mFscore'] - if not set(metrics).issubset(set(allowed_metrics)): - raise KeyError('metrics {} is not supported'.format(metrics)) - - total_area_intersect, total_area_union, total_area_pred_label, \ - total_area_label = total_intersect_and_union( - results, gt_seg_maps, num_classes, ignore_index, label_map, - reduce_zero_label) - all_acc = total_area_intersect.sum() / total_area_label.sum() - ret_metrics = OrderedDict({'aAcc': all_acc}) - for metric in metrics: - if metric == 'mIoU': - iou = total_area_intersect / total_area_union - acc = total_area_intersect / total_area_label - ret_metrics['IoU'] = iou - ret_metrics['Acc'] = acc - elif metric == 'mDice': - dice = 2 * total_area_intersect / ( - total_area_pred_label + total_area_label) - acc = total_area_intersect / total_area_label - ret_metrics['Dice'] = dice - ret_metrics['Acc'] = acc - elif metric == 'mFscore': - precision = total_area_intersect / total_area_pred_label - recall = total_area_intersect / total_area_label - f_value = torch.tensor( - [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) - ret_metrics['Fscore'] = f_value - ret_metrics['Precision'] = precision - ret_metrics['Recall'] = recall - - ret_metrics = { - metric: value.numpy() - for metric, value in ret_metrics.items() - } - if nan_to_num is not None: - ret_metrics = OrderedDict({ - metric: np.nan_to_num(metric_value, nan=nan_to_num) - for metric, metric_value in ret_metrics.items() - }) - return ret_metrics diff --git a/ControlNet/annotator/uniformer/mmseg/core/seg/__init__.py b/ControlNet/annotator/uniformer/mmseg/core/seg/__init__.py deleted file mode 100644 index 93bc129b685e4a3efca2cc891729981b2865900d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/seg/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .builder import build_pixel_sampler -from .sampler import BasePixelSampler, OHEMPixelSampler - -__all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] diff --git a/ControlNet/annotator/uniformer/mmseg/core/seg/builder.py 
b/ControlNet/annotator/uniformer/mmseg/core/seg/builder.py deleted file mode 100644 index db61f03d4abb2072f2532ce4429c0842495e015b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/seg/builder.py +++ /dev/null @@ -1,8 +0,0 @@ -from annotator.uniformer.mmcv.utils import Registry, build_from_cfg - -PIXEL_SAMPLERS = Registry('pixel sampler') - - -def build_pixel_sampler(cfg, **default_args): - """Build pixel sampler for segmentation map.""" - return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) diff --git a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/__init__.py b/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/__init__.py deleted file mode 100644 index 332b242c03d1c5e80d4577df442a9a037b1816e1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .base_pixel_sampler import BasePixelSampler -from .ohem_pixel_sampler import OHEMPixelSampler - -__all__ = ['BasePixelSampler', 'OHEMPixelSampler'] diff --git a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py b/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py deleted file mode 100644 index b75b1566c9f18169cee51d4b55d75e0357b69c57..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py +++ /dev/null @@ -1,12 +0,0 @@ -from abc import ABCMeta, abstractmethod - - -class BasePixelSampler(metaclass=ABCMeta): - """Base class of pixel sampler.""" - - def __init__(self, **kwargs): - pass - - @abstractmethod - def sample(self, seg_logit, seg_label): - """Placeholder for sample function.""" diff --git a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/ohem_pixel_sampler.py b/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/ohem_pixel_sampler.py deleted file mode 100644 index 88bb10d44026ba9f21756eaea9e550841cd59b9f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/seg/sampler/ohem_pixel_sampler.py +++ /dev/null @@ -1,76 +0,0 @@ -import torch -import torch.nn.functional as F - -from ..builder import PIXEL_SAMPLERS -from .base_pixel_sampler import BasePixelSampler - - -@PIXEL_SAMPLERS.register_module() -class OHEMPixelSampler(BasePixelSampler): - """Online Hard Example Mining Sampler for segmentation. - - Args: - context (nn.Module): The context of sampler, subclass of - :obj:`BaseDecodeHead`. - thresh (float, optional): The threshold for hard example selection. - Below which, are prediction with low confidence. If not - specified, the hard examples will be pixels of top ``min_kept`` - loss. Default: None. - min_kept (int, optional): The minimum number of predictions to keep. - Default: 100000. - """ - - def __init__(self, context, thresh=None, min_kept=100000): - super(OHEMPixelSampler, self).__init__() - self.context = context - assert min_kept > 1 - self.thresh = thresh - self.min_kept = min_kept - - def sample(self, seg_logit, seg_label): - """Sample pixels that have high loss or with low prediction confidence. 
- - Args: - seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W) - seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W) - - Returns: - torch.Tensor: segmentation weight, shape (N, H, W) - """ - with torch.no_grad(): - assert seg_logit.shape[2:] == seg_label.shape[2:] - assert seg_label.shape[1] == 1 - seg_label = seg_label.squeeze(1).long() - batch_kept = self.min_kept * seg_label.size(0) - valid_mask = seg_label != self.context.ignore_index - seg_weight = seg_logit.new_zeros(size=seg_label.size()) - valid_seg_weight = seg_weight[valid_mask] - if self.thresh is not None: - seg_prob = F.softmax(seg_logit, dim=1) - - tmp_seg_label = seg_label.clone().unsqueeze(1) - tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0 - seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1) - sort_prob, sort_indices = seg_prob[valid_mask].sort() - - if sort_prob.numel() > 0: - min_threshold = sort_prob[min(batch_kept, - sort_prob.numel() - 1)] - else: - min_threshold = 0.0 - threshold = max(min_threshold, self.thresh) - valid_seg_weight[seg_prob[valid_mask] < threshold] = 1. - else: - losses = self.context.loss_decode( - seg_logit, - seg_label, - weight=None, - ignore_index=self.context.ignore_index, - reduction_override='none') - # faster than topk according to https://github.com/pytorch/pytorch/issues/22812 # noqa - _, sort_indices = losses[valid_mask].sort(descending=True) - valid_seg_weight[sort_indices[:batch_kept]] = 1. - - seg_weight[valid_mask] = valid_seg_weight - - return seg_weight diff --git a/ControlNet/annotator/uniformer/mmseg/core/utils/__init__.py b/ControlNet/annotator/uniformer/mmseg/core/utils/__init__.py deleted file mode 100644 index f2678b321c295bcceaef945111ac3524be19d6e4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .misc import add_prefix - -__all__ = ['add_prefix'] diff --git a/ControlNet/annotator/uniformer/mmseg/core/utils/misc.py b/ControlNet/annotator/uniformer/mmseg/core/utils/misc.py deleted file mode 100644 index eb862a82bd47c8624db3dd5c6fb6ad8a03b62466..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/core/utils/misc.py +++ /dev/null @@ -1,17 +0,0 @@ -def add_prefix(inputs, prefix): - """Add prefix for dict. - - Args: - inputs (dict): The input dict with str keys. - prefix (str): The prefix to add. - - Returns: - - dict: The dict with keys updated with ``prefix``. 
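# Editor's aside (assumed usage, not in the original file): given the body below,
# add_prefix({'loss_ce': 0.1, 'acc': 0.9}, 'aux') is expected to return
# {'aux.loss_ce': 0.1, 'aux.acc': 0.9}: the keys gain the prefix while the values
# pass through unchanged.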
- """ - - outputs = dict() - for name, value in inputs.items(): - outputs[f'{prefix}.{name}'] = value - - return outputs diff --git a/ControlNet/annotator/uniformer/mmseg/models/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/__init__.py deleted file mode 100644 index 3cf93f8bec9cf0cef0a3bd76ca3ca92eb188f535..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .backbones import * # noqa: F401,F403 -from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, - build_head, build_loss, build_segmentor) -from .decode_heads import * # noqa: F401,F403 -from .losses import * # noqa: F401,F403 -from .necks import * # noqa: F401,F403 -from .segmentors import * # noqa: F401,F403 - -__all__ = [ - 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', - 'build_head', 'build_loss', 'build_segmentor' -] diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/__init__.py deleted file mode 100644 index 8339983905fb5d20bae42ba6f76fea75d278b1aa..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from .cgnet import CGNet -# from .fast_scnn import FastSCNN -from .hrnet import HRNet -from .mobilenet_v2 import MobileNetV2 -from .mobilenet_v3 import MobileNetV3 -from .resnest import ResNeSt -from .resnet import ResNet, ResNetV1c, ResNetV1d -from .resnext import ResNeXt -from .unet import UNet -from .vit import VisionTransformer -from .uniformer import UniFormer - -__all__ = [ - 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', - 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', - 'VisionTransformer', 'UniFormer' -] diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/cgnet.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/cgnet.py deleted file mode 100644 index f8bca442c8f18179f217e40c298fb5ef39df77c4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/cgnet.py +++ /dev/null @@ -1,367 +0,0 @@ -import torch -import torch.nn as nn -import torch.utils.checkpoint as cp -from annotator.uniformer.mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, - constant_init, kaiming_init) -from annotator.uniformer.mmcv.runner import load_checkpoint -from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm - -from annotator.uniformer.mmseg.utils import get_root_logger -from ..builder import BACKBONES - - -class GlobalContextExtractor(nn.Module): - """Global Context Extractor for CGNet. - - This class is employed to refine the joint feature of both local feature - and surrounding context. - - Args: - channel (int): Number of input feature channels. - reduction (int): Reductions for global context extractor. Default: 16. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, channel, reduction=16, with_cp=False): - super(GlobalContextExtractor, self).__init__() - self.channel = channel - self.reduction = reduction - assert reduction >= 1 and channel >= reduction - self.with_cp = with_cp - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), nn.Sigmoid()) - - def forward(self, x): - - def _inner_forward(x): - num_batch, num_channel = x.size()[:2] - y = self.avg_pool(x).view(num_batch, num_channel) - y = self.fc(y).view(num_batch, num_channel, 1, 1) - return x * y - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class ContextGuidedBlock(nn.Module): - """Context Guided Block for CGNet. - - This class consists of four components: local feature extractor, - surrounding feature extractor, joint feature extractor and global - context extractor. - - Args: - in_channels (int): Number of input feature channels. - out_channels (int): Number of output feature channels. - dilation (int): Dilation rate for surrounding context extractor. - Default: 2. - reduction (int): Reduction for global context extractor. Default: 16. - skip_connect (bool): Add input to output or not. Default: True. - downsample (bool): Downsample the input to 1/2 or not. Default: False. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='PReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - out_channels, - dilation=2, - reduction=16, - skip_connect=True, - downsample=False, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - act_cfg=dict(type='PReLU'), - with_cp=False): - super(ContextGuidedBlock, self).__init__() - self.with_cp = with_cp - self.downsample = downsample - - channels = out_channels if downsample else out_channels // 2 - if 'type' in act_cfg and act_cfg['type'] == 'PReLU': - act_cfg['num_parameters'] = channels - kernel_size = 3 if downsample else 1 - stride = 2 if downsample else 1 - padding = (kernel_size - 1) // 2 - - self.conv1x1 = ConvModule( - in_channels, - channels, - kernel_size, - stride, - padding, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - self.f_loc = build_conv_layer( - conv_cfg, - channels, - channels, - kernel_size=3, - padding=1, - groups=channels, - bias=False) - self.f_sur = build_conv_layer( - conv_cfg, - channels, - channels, - kernel_size=3, - padding=dilation, - groups=channels, - dilation=dilation, - bias=False) - - self.bn = build_norm_layer(norm_cfg, 2 * channels)[1] - self.activate = nn.PReLU(2 * channels) - - if downsample: - self.bottleneck = build_conv_layer( - conv_cfg, - 2 * channels, - out_channels, - kernel_size=1, - bias=False) - - self.skip_connect = skip_connect and not downsample - self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp) - - def forward(self, x): - - def _inner_forward(x): - out = self.conv1x1(x) - loc = self.f_loc(out) - sur = self.f_sur(out) - - joi_feat = torch.cat([loc, sur], 1) # the joint feature - joi_feat = self.bn(joi_feat) - joi_feat = self.activate(joi_feat) - if self.downsample: - joi_feat = self.bottleneck(joi_feat) # channel = out_channels - # f_glo is employed to refine the joint feature - out = self.f_glo(joi_feat) - - if self.skip_connect: - return x + out - else: - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class InputInjection(nn.Module): - """Downsampling module for CGNet.""" - - def __init__(self, num_downsampling): - super(InputInjection, self).__init__() - self.pool = nn.ModuleList() - for i in range(num_downsampling): - self.pool.append(nn.AvgPool2d(3, stride=2, padding=1)) - - def forward(self, x): - for pool in self.pool: - x = pool(x) - return x - - -@BACKBONES.register_module() -class CGNet(nn.Module): - """CGNet backbone. - - A Light-weight Context Guided Network for Semantic Segmentation - arXiv: https://arxiv.org/abs/1811.08201 - - Args: - in_channels (int): Number of input image channels. Normally 3. - num_channels (tuple[int]): Numbers of feature channels at each stages. - Default: (32, 64, 128). - num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2. - Default: (3, 21). - dilations (tuple[int]): Dilation rate for surrounding context - extractors at stage 1 and stage 2. Default: (2, 4). - reductions (tuple[int]): Reductions for global context extractors at - stage 1 and stage 2. Default: (8, 16). - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='PReLU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. 
- with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - def __init__(self, - in_channels=3, - num_channels=(32, 64, 128), - num_blocks=(3, 21), - dilations=(2, 4), - reductions=(8, 16), - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - act_cfg=dict(type='PReLU'), - norm_eval=False, - with_cp=False): - - super(CGNet, self).__init__() - self.in_channels = in_channels - self.num_channels = num_channels - assert isinstance(self.num_channels, tuple) and len( - self.num_channels) == 3 - self.num_blocks = num_blocks - assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2 - self.dilations = dilations - assert isinstance(self.dilations, tuple) and len(self.dilations) == 2 - self.reductions = reductions - assert isinstance(self.reductions, tuple) and len(self.reductions) == 2 - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU': - self.act_cfg['num_parameters'] = num_channels[0] - self.norm_eval = norm_eval - self.with_cp = with_cp - - cur_channels = in_channels - self.stem = nn.ModuleList() - for i in range(3): - self.stem.append( - ConvModule( - cur_channels, - num_channels[0], - 3, - 2 if i == 0 else 1, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - cur_channels = num_channels[0] - - self.inject_2x = InputInjection(1) # down-sample for Input, factor=2 - self.inject_4x = InputInjection(2) # down-sample for Input, factor=4 - - cur_channels += in_channels - self.norm_prelu_0 = nn.Sequential( - build_norm_layer(norm_cfg, cur_channels)[1], - nn.PReLU(cur_channels)) - - # stage 1 - self.level1 = nn.ModuleList() - for i in range(num_blocks[0]): - self.level1.append( - ContextGuidedBlock( - cur_channels if i == 0 else num_channels[1], - num_channels[1], - dilations[0], - reductions[0], - downsample=(i == 0), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) # CG block - - cur_channels = 2 * num_channels[1] + in_channels - self.norm_prelu_1 = nn.Sequential( - build_norm_layer(norm_cfg, cur_channels)[1], - nn.PReLU(cur_channels)) - - # stage 2 - self.level2 = nn.ModuleList() - for i in range(num_blocks[1]): - self.level2.append( - ContextGuidedBlock( - cur_channels if i == 0 else num_channels[2], - num_channels[2], - dilations[1], - reductions[1], - downsample=(i == 0), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) # CG block - - cur_channels = 2 * num_channels[2] - self.norm_prelu_2 = nn.Sequential( - build_norm_layer(norm_cfg, cur_channels)[1], - nn.PReLU(cur_channels)) - - def forward(self, x): - output = [] - - # stage 0 - inp_2x = self.inject_2x(x) - inp_4x = self.inject_4x(x) - for layer in self.stem: - x = layer(x) - x = self.norm_prelu_0(torch.cat([x, inp_2x], 1)) - output.append(x) - - # stage 1 - for i, layer in enumerate(self.level1): - x = layer(x) - if i == 0: - down1 = x - x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1)) - output.append(x) - - # stage 2 - for i, layer in enumerate(self.level2): - x = layer(x) - if i == 0: - down2 = x - x = self.norm_prelu_2(torch.cat([down2, x], 1)) - output.append(x) - - return output - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
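# Editor's aside (an assumed usage sketch, not part of the original diff): with the
# defaults shown in __init__ above, a dummy forward pass would look roughly like
#     model = CGNet(in_channels=3, num_channels=(32, 64, 128), num_blocks=(3, 21))
#     model.init_weights()
#     outs = model(torch.rand(1, 3, 64, 64))
# and return three feature maps, one per stage, at 1/2, 1/4 and 1/8 of the input
# resolution (the stem downsamples once, and the first block of each of the two
# CG stages downsamples again).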
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, (nn.Conv2d, nn.Linear)): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - elif isinstance(m, nn.PReLU): - constant_init(m, 0) - else: - raise TypeError('pretrained must be a str or None') - - def train(self, mode=True): - """Convert the model into training mode will keeping the normalization - layer freezed.""" - super(CGNet, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/fast_scnn.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/fast_scnn.py deleted file mode 100644 index 38c2350177cbc2066f45add568d30eb6041f74f3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/fast_scnn.py +++ /dev/null @@ -1,375 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, constant_init, - kaiming_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from annotator.uniformer.mmseg.models.decode_heads.psp_head import PPM -from annotator.uniformer.mmseg.ops import resize -from ..builder import BACKBONES -from ..utils.inverted_residual import InvertedResidual - - -class LearningToDownsample(nn.Module): - """Learning to downsample module. - - Args: - in_channels (int): Number of input channels. - dw_channels (tuple[int]): Number of output channels of the first and - the second depthwise conv (dwconv) layers. - out_channels (int): Number of output channels of the whole - 'learning to downsample' module. - conv_cfg (dict | None): Config of conv layers. Default: None - norm_cfg (dict | None): Config of norm layers. Default: - dict(type='BN') - act_cfg (dict): Config of activation layers. Default: - dict(type='ReLU') - """ - - def __init__(self, - in_channels, - dw_channels, - out_channels, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU')): - super(LearningToDownsample, self).__init__() - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - dw_channels1 = dw_channels[0] - dw_channels2 = dw_channels[1] - - self.conv = ConvModule( - in_channels, - dw_channels1, - 3, - stride=2, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.dsconv1 = DepthwiseSeparableConvModule( - dw_channels1, - dw_channels2, - kernel_size=3, - stride=2, - padding=1, - norm_cfg=self.norm_cfg) - self.dsconv2 = DepthwiseSeparableConvModule( - dw_channels2, - out_channels, - kernel_size=3, - stride=2, - padding=1, - norm_cfg=self.norm_cfg) - - def forward(self, x): - x = self.conv(x) - x = self.dsconv1(x) - x = self.dsconv2(x) - return x - - -class GlobalFeatureExtractor(nn.Module): - """Global feature extractor module. - - Args: - in_channels (int): Number of input channels of the GFE module. - Default: 64 - block_channels (tuple[int]): Tuple of ints. Each int specifies the - number of output channels of each Inverted Residual module. - Default: (64, 96, 128) - out_channels(int): Number of output channels of the GFE module. - Default: 128 - expand_ratio (int): Adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - Default: 6 - num_blocks (tuple[int]): Tuple of ints. 
Each int specifies the - number of times each Inverted Residual module is repeated. - The repeated Inverted Residual modules are called a 'group'. - Default: (3, 3, 3) - strides (tuple[int]): Tuple of ints. Each int specifies - the downsampling factor of each 'group'. - Default: (2, 2, 1) - pool_scales (tuple[int]): Tuple of ints. Each int specifies - the parameter required in 'global average pooling' within PPM. - Default: (1, 2, 3, 6) - conv_cfg (dict | None): Config of conv layers. Default: None - norm_cfg (dict | None): Config of norm layers. Default: - dict(type='BN') - act_cfg (dict): Config of activation layers. Default: - dict(type='ReLU') - align_corners (bool): align_corners argument of F.interpolate. - Default: False - """ - - def __init__(self, - in_channels=64, - block_channels=(64, 96, 128), - out_channels=128, - expand_ratio=6, - num_blocks=(3, 3, 3), - strides=(2, 2, 1), - pool_scales=(1, 2, 3, 6), - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - align_corners=False): - super(GlobalFeatureExtractor, self).__init__() - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - assert len(block_channels) == len(num_blocks) == 3 - self.bottleneck1 = self._make_layer(in_channels, block_channels[0], - num_blocks[0], strides[0], - expand_ratio) - self.bottleneck2 = self._make_layer(block_channels[0], - block_channels[1], num_blocks[1], - strides[1], expand_ratio) - self.bottleneck3 = self._make_layer(block_channels[1], - block_channels[2], num_blocks[2], - strides[2], expand_ratio) - self.ppm = PPM( - pool_scales, - block_channels[2], - block_channels[2] // 4, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - align_corners=align_corners) - self.out = ConvModule( - block_channels[2] * 2, - out_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def _make_layer(self, - in_channels, - out_channels, - blocks, - stride=1, - expand_ratio=6): - layers = [ - InvertedResidual( - in_channels, - out_channels, - stride, - expand_ratio, - norm_cfg=self.norm_cfg) - ] - for i in range(1, blocks): - layers.append( - InvertedResidual( - out_channels, - out_channels, - 1, - expand_ratio, - norm_cfg=self.norm_cfg)) - return nn.Sequential(*layers) - - def forward(self, x): - x = self.bottleneck1(x) - x = self.bottleneck2(x) - x = self.bottleneck3(x) - x = torch.cat([x, *self.ppm(x)], dim=1) - x = self.out(x) - return x - - -class FeatureFusionModule(nn.Module): - """Feature fusion module. - - Args: - higher_in_channels (int): Number of input channels of the - higher-resolution branch. - lower_in_channels (int): Number of input channels of the - lower-resolution branch. - out_channels (int): Number of output channels. - conv_cfg (dict | None): Config of conv layers. Default: None - norm_cfg (dict | None): Config of norm layers. Default: - dict(type='BN') - act_cfg (dict): Config of activation layers. Default: - dict(type='ReLU') - align_corners (bool): align_corners argument of F.interpolate. 
- Default: False - """ - - def __init__(self, - higher_in_channels, - lower_in_channels, - out_channels, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - align_corners=False): - super(FeatureFusionModule, self).__init__() - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.align_corners = align_corners - self.dwconv = ConvModule( - lower_in_channels, - out_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.conv_lower_res = ConvModule( - out_channels, - out_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=None) - self.conv_higher_res = ConvModule( - higher_in_channels, - out_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=None) - self.relu = nn.ReLU(True) - - def forward(self, higher_res_feature, lower_res_feature): - lower_res_feature = resize( - lower_res_feature, - size=higher_res_feature.size()[2:], - mode='bilinear', - align_corners=self.align_corners) - lower_res_feature = self.dwconv(lower_res_feature) - lower_res_feature = self.conv_lower_res(lower_res_feature) - - higher_res_feature = self.conv_higher_res(higher_res_feature) - out = higher_res_feature + lower_res_feature - return self.relu(out) - - -@BACKBONES.register_module() -class FastSCNN(nn.Module): - """Fast-SCNN Backbone. - - Args: - in_channels (int): Number of input image channels. Default: 3. - downsample_dw_channels (tuple[int]): Number of output channels after - the first conv layer & the second conv layer in - Learning-To-Downsample (LTD) module. - Default: (32, 48). - global_in_channels (int): Number of input channels of - Global Feature Extractor(GFE). - Equal to number of output channels of LTD. - Default: 64. - global_block_channels (tuple[int]): Tuple of integers that describe - the output channels for each of the MobileNet-v2 bottleneck - residual blocks in GFE. - Default: (64, 96, 128). - global_block_strides (tuple[int]): Tuple of integers - that describe the strides (downsampling factors) for each of the - MobileNet-v2 bottleneck residual blocks in GFE. - Default: (2, 2, 1). - global_out_channels (int): Number of output channels of GFE. - Default: 128. - higher_in_channels (int): Number of input channels of the higher - resolution branch in FFM. - Equal to global_in_channels. - Default: 64. - lower_in_channels (int): Number of input channels of the lower - resolution branch in FFM. - Equal to global_out_channels. - Default: 128. - fusion_out_channels (int): Number of output channels of FFM. - Default: 128. - out_indices (tuple): Tuple of indices of list - [higher_res_features, lower_res_features, fusion_output]. - Often set to (0,1,2) to enable aux. heads. - Default: (0, 1, 2). - conv_cfg (dict | None): Config of conv layers. Default: None - norm_cfg (dict | None): Config of norm layers. Default: - dict(type='BN') - act_cfg (dict): Config of activation layers. Default: - dict(type='ReLU') - align_corners (bool): align_corners argument of F.interpolate. 
- Default: False - """ - - def __init__(self, - in_channels=3, - downsample_dw_channels=(32, 48), - global_in_channels=64, - global_block_channels=(64, 96, 128), - global_block_strides=(2, 2, 1), - global_out_channels=128, - higher_in_channels=64, - lower_in_channels=128, - fusion_out_channels=128, - out_indices=(0, 1, 2), - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - align_corners=False): - - super(FastSCNN, self).__init__() - if global_in_channels != higher_in_channels: - raise AssertionError('Global Input Channels must be the same \ - with Higher Input Channels!') - elif global_out_channels != lower_in_channels: - raise AssertionError('Global Output Channels must be the same \ - with Lower Input Channels!') - - self.in_channels = in_channels - self.downsample_dw_channels1 = downsample_dw_channels[0] - self.downsample_dw_channels2 = downsample_dw_channels[1] - self.global_in_channels = global_in_channels - self.global_block_channels = global_block_channels - self.global_block_strides = global_block_strides - self.global_out_channels = global_out_channels - self.higher_in_channels = higher_in_channels - self.lower_in_channels = lower_in_channels - self.fusion_out_channels = fusion_out_channels - self.out_indices = out_indices - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.align_corners = align_corners - self.learning_to_downsample = LearningToDownsample( - in_channels, - downsample_dw_channels, - global_in_channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.global_feature_extractor = GlobalFeatureExtractor( - global_in_channels, - global_block_channels, - global_out_channels, - strides=self.global_block_strides, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - align_corners=self.align_corners) - self.feature_fusion = FeatureFusionModule( - higher_in_channels, - lower_in_channels, - fusion_out_channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - align_corners=self.align_corners) - - def init_weights(self, pretrained=None): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - def forward(self, x): - higher_res_features = self.learning_to_downsample(x) - lower_res_features = self.global_feature_extractor(higher_res_features) - fusion_output = self.feature_fusion(higher_res_features, - lower_res_features) - - outs = [higher_res_features, lower_res_features, fusion_output] - outs = [outs[i] for i in self.out_indices] - return tuple(outs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/hrnet.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/hrnet.py deleted file mode 100644 index 331ebf3ccb8597b3f507670753789073fc3c946d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/hrnet.py +++ /dev/null @@ -1,555 +0,0 @@ -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, - kaiming_init) -from annotator.uniformer.mmcv.runner import load_checkpoint -from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm - -from annotator.uniformer.mmseg.ops import Upsample, resize -from annotator.uniformer.mmseg.utils import get_root_logger -from ..builder import BACKBONES -from .resnet import BasicBlock, Bottleneck - - -class HRModule(nn.Module): - """High-Resolution Module for HRNet. 
- - In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange - is in this module. - """ - - def __init__(self, - num_branches, - blocks, - num_blocks, - in_channels, - num_channels, - multiscale_output=True, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True)): - super(HRModule, self).__init__() - self._check_branches(num_branches, num_blocks, in_channels, - num_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp - self.branches = self._make_branches(num_branches, blocks, num_blocks, - num_channels) - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU(inplace=False) - - def _check_branches(self, num_branches, num_blocks, in_channels, - num_channels): - """Check branches configuration.""" - if num_branches != len(num_blocks): - error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_BLOCKS(' \ - f'{len(num_blocks)})' - raise ValueError(error_msg) - - if num_branches != len(num_channels): - error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_CHANNELS(' \ - f'{len(num_channels)})' - raise ValueError(error_msg) - - if num_branches != len(in_channels): - error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_INCHANNELS(' \ - f'{len(in_channels)})' - raise ValueError(error_msg) - - def _make_one_branch(self, - branch_index, - block, - num_blocks, - num_channels, - stride=1): - """Build one branch.""" - downsample = None - if stride != 1 or \ - self.in_channels[branch_index] != \ - num_channels[branch_index] * block.expansion: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.in_channels[branch_index], - num_channels[branch_index] * block.expansion, - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer(self.norm_cfg, num_channels[branch_index] * - block.expansion)[1]) - - layers = [] - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - self.in_channels[branch_index] = \ - num_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_branches(self, num_branches, block, num_blocks, num_channels): - """Build multiple branch.""" - branches = [] - - for i in range(num_branches): - branches.append( - self._make_one_branch(i, block, num_blocks, num_channels)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Build fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - # we set align_corners=False for HRNet - Upsample( - scale_factor=2**(j - i), - mode='bilinear', - align_corners=False))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): 
- if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[i])[1])) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - nn.ReLU(inplace=False))) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i in range(self.num_branches): - x[i] = self.branches[i](x[i]) - - x_fuse = [] - for i in range(len(self.fuse_layers)): - y = 0 - for j in range(self.num_branches): - if i == j: - y += x[j] - elif j > i: - y = y + resize( - self.fuse_layers[i][j](x[j]), - size=x[i].shape[2:], - mode='bilinear', - align_corners=False) - else: - y += self.fuse_layers[i][j](x[j]) - x_fuse.append(self.relu(y)) - return x_fuse - - -@BACKBONES.register_module() -class HRNet(nn.Module): - """HRNet backbone. - - High-Resolution Representations for Labeling Pixels and Regions - arXiv: https://arxiv.org/abs/1904.04514 - - Args: - extra (dict): detailed configuration for each stage of HRNet. - in_channels (int): Number of input image channels. Normally 3. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from annotator.uniformer.mmseg.models import HRNet - >>> import torch - >>> extra = dict( - >>> stage1=dict( - >>> num_modules=1, - >>> num_branches=1, - >>> block='BOTTLENECK', - >>> num_blocks=(4, ), - >>> num_channels=(64, )), - >>> stage2=dict( - >>> num_modules=1, - >>> num_branches=2, - >>> block='BASIC', - >>> num_blocks=(4, 4), - >>> num_channels=(32, 64)), - >>> stage3=dict( - >>> num_modules=4, - >>> num_branches=3, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4), - >>> num_channels=(32, 64, 128)), - >>> stage4=dict( - >>> num_modules=3, - >>> num_branches=4, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4, 4), - >>> num_channels=(32, 64, 128, 256))) - >>> self = HRNet(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 32, 8, 8) - (1, 64, 4, 4) - (1, 128, 2, 2) - (1, 256, 1, 1) - """ - - blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=False): - super(HRNet, self).__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual - - # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - 64, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.relu = nn.ReLU(inplace=True) - - # stage 1 - self.stage1_cfg = self.extra['stage1'] - num_channels = self.stage1_cfg['num_channels'][0] - block_type = self.stage1_cfg['block'] - num_blocks = self.stage1_cfg['num_blocks'][0] - - block = self.blocks_dict[block_type] - stage1_out_channels = num_channels * block.expansion - self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) - - # stage 2 - self.stage2_cfg = self.extra['stage2'] - num_channels = self.stage2_cfg['num_channels'] - block_type = self.stage2_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition1 = self._make_transition_layer([stage1_out_channels], - num_channels) - self.stage2, pre_stage_channels = self._make_stage( - self.stage2_cfg, num_channels) - - # stage 3 - self.stage3_cfg = self.extra['stage3'] - num_channels = self.stage3_cfg['num_channels'] - block_type = self.stage3_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition2 = self._make_transition_layer(pre_stage_channels, - num_channels) - self.stage3, pre_stage_channels = self._make_stage( - self.stage3_cfg, num_channels) - - # stage 4 - self.stage4_cfg = self.extra['stage4'] - num_channels = self.stage4_cfg['num_channels'] - block_type = self.stage4_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition3 = self._make_transition_layer(pre_stage_channels, - num_channels) - self.stage4, pre_stage_channels = self._make_stage( - self.stage4_cfg, num_channels) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - def _make_transition_layer(self, num_channels_pre_layer, - num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=3, - stride=1, - padding=1, 
- bias=False), - build_norm_layer(self.norm_cfg, - num_channels_cur_layer[i])[1], - nn.ReLU(inplace=True))) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = num_channels_cur_layer[i] \ - if j == i - num_branches_pre else in_channels - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU(inplace=True))) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_layer(self, block, inplanes, planes, blocks, stride=1): - """Make each layer.""" - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) - - layers = [] - layers.append( - block( - inplanes, - planes, - stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_stage(self, layer_config, in_channels, multiscale_output=True): - """Make each stage.""" - num_modules = layer_config['num_modules'] - num_branches = layer_config['num_branches'] - num_blocks = layer_config['num_blocks'] - num_channels = layer_config['num_channels'] - block = self.blocks_dict[layer_config['block']] - - hr_modules = [] - for i in range(num_modules): - # multi_scale_output is only used for the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - hr_modules.append( - HRModule( - num_branches, - block, - num_blocks, - in_channels, - num_channels, - reset_multiscale_output, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*hr_modules), in_channels - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.relu(x) - x = self.layer1(x) - - x_list = [] - for i in range(self.stage2_cfg['num_branches']): - if self.transition1[i] is not None: - x_list.append(self.transition1[i](x)) - else: - x_list.append(x) - y_list = self.stage2(x_list) - - x_list = [] - for i in range(self.stage3_cfg['num_branches']): - if self.transition2[i] is not None: - x_list.append(self.transition2[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage3(x_list) - - x_list = [] - for i in range(self.stage4_cfg['num_branches']): - if self.transition3[i] is not None: - x_list.append(self.transition3[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage4(x_list) - - return y_list - - def train(self, mode=True): - """Convert the model into training mode will keeping the normalization - layer freezed.""" - super(HRNet, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py deleted file mode 100644 index ab6b3791692a0d1b5da3601875711710b7bd01ba..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v2.py +++ /dev/null @@ -1,180 +0,0 @@ -import logging - -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule, constant_init, kaiming_init -from annotator.uniformer.mmcv.runner import load_checkpoint -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from ..utils import InvertedResidual, make_divisible - - -@BACKBONES.register_module() -class MobileNetV2(nn.Module): - """MobileNetV2 backbone. - - Args: - widen_factor (float): Width multiplier, multiply number of - channels in each layer by this amount. Default: 1.0. - strides (Sequence[int], optional): Strides of the first block of each - layer. If not specified, default config in ``arch_setting`` will - be used. - dilations (Sequence[int]): Dilation of each layer. - out_indices (None or Sequence[int]): Output from which stages. - Default: (7, ). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - # Parameters to build layers. 3 parameters are needed to construct a - # layer, from left to right: expand_ratio, channel, num_blocks. - arch_settings = [[1, 16, 1], [6, 24, 2], [6, 32, 3], [6, 64, 4], - [6, 96, 3], [6, 160, 3], [6, 320, 1]] - - def __init__(self, - widen_factor=1., - strides=(1, 2, 2, 2, 1, 2, 1), - dilations=(1, 1, 1, 1, 1, 1, 1), - out_indices=(1, 2, 4, 6), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - norm_eval=False, - with_cp=False): - super(MobileNetV2, self).__init__() - self.widen_factor = widen_factor - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == len(self.arch_settings) - self.out_indices = out_indices - for index in out_indices: - if index not in range(0, 7): - raise ValueError('the item in out_indices must in ' - f'range(0, 8). But received {index}') - - if frozen_stages not in range(-1, 7): - raise ValueError('frozen_stages must be in range(-1, 7). ' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.in_channels = make_divisible(32 * widen_factor, 8) - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.layers = [] - - for i, layer_cfg in enumerate(self.arch_settings): - expand_ratio, channel, num_blocks = layer_cfg - stride = self.strides[i] - dilation = self.dilations[i] - out_channels = make_divisible(channel * widen_factor, 8) - inverted_res_layer = self.make_layer( - out_channels=out_channels, - num_blocks=num_blocks, - stride=stride, - dilation=dilation, - expand_ratio=expand_ratio) - layer_name = f'layer{i + 1}' - self.add_module(layer_name, inverted_res_layer) - self.layers.append(layer_name) - - def make_layer(self, out_channels, num_blocks, stride, dilation, - expand_ratio): - """Stack InvertedResidual blocks to build a layer for MobileNetV2. - - Args: - out_channels (int): out_channels of block. - num_blocks (int): Number of blocks. - stride (int): Stride of the first block. - dilation (int): Dilation of the first block. - expand_ratio (int): Expand the number of channels of the - hidden layer in InvertedResidual by this ratio. 
- """ - layers = [] - for i in range(num_blocks): - layers.append( - InvertedResidual( - self.in_channels, - out_channels, - stride if i == 0 else 1, - expand_ratio=expand_ratio, - dilation=dilation if i == 0 else 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - else: - return tuple(outs) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(1, self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super(MobileNetV2, self).train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py deleted file mode 100644 index 16817400b4102899794fe64c9644713a4e54e2f9..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/mobilenet_v3.py +++ /dev/null @@ -1,255 +0,0 @@ -import logging - -import annotator.uniformer.mmcv as mmcv -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule, constant_init, kaiming_init -from annotator.uniformer.mmcv.cnn.bricks import Conv2dAdaptivePadding -from annotator.uniformer.mmcv.runner import load_checkpoint -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from ..utils import InvertedResidualV3 as InvertedResidual - - -@BACKBONES.register_module() -class MobileNetV3(nn.Module): - """MobileNetV3 backbone. - - This backbone is the improved implementation of `Searching for MobileNetV3 - `_. - - Args: - arch (str): Architecture of mobilnetv3, from {'small', 'large'}. - Default: 'small'. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - out_indices (tuple[int]): Output from which layer. - Default: (0, 1, 12). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. - Default: False. 
- """ - # Parameters to build each block: - # [kernel size, mid channels, out channels, with_se, act type, stride] - arch_settings = { - 'small': [[3, 16, 16, True, 'ReLU', 2], # block0 layer1 os=4 - [3, 72, 24, False, 'ReLU', 2], # block1 layer2 os=8 - [3, 88, 24, False, 'ReLU', 1], - [5, 96, 40, True, 'HSwish', 2], # block2 layer4 os=16 - [5, 240, 40, True, 'HSwish', 1], - [5, 240, 40, True, 'HSwish', 1], - [5, 120, 48, True, 'HSwish', 1], # block3 layer7 os=16 - [5, 144, 48, True, 'HSwish', 1], - [5, 288, 96, True, 'HSwish', 2], # block4 layer9 os=32 - [5, 576, 96, True, 'HSwish', 1], - [5, 576, 96, True, 'HSwish', 1]], - 'large': [[3, 16, 16, False, 'ReLU', 1], # block0 layer1 os=2 - [3, 64, 24, False, 'ReLU', 2], # block1 layer2 os=4 - [3, 72, 24, False, 'ReLU', 1], - [5, 72, 40, True, 'ReLU', 2], # block2 layer4 os=8 - [5, 120, 40, True, 'ReLU', 1], - [5, 120, 40, True, 'ReLU', 1], - [3, 240, 80, False, 'HSwish', 2], # block3 layer7 os=16 - [3, 200, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 480, 112, True, 'HSwish', 1], # block4 layer11 os=16 - [3, 672, 112, True, 'HSwish', 1], - [5, 672, 160, True, 'HSwish', 2], # block5 layer13 os=32 - [5, 960, 160, True, 'HSwish', 1], - [5, 960, 160, True, 'HSwish', 1]] - } # yapf: disable - - def __init__(self, - arch='small', - conv_cfg=None, - norm_cfg=dict(type='BN'), - out_indices=(0, 1, 12), - frozen_stages=-1, - reduction_factor=1, - norm_eval=False, - with_cp=False): - super(MobileNetV3, self).__init__() - assert arch in self.arch_settings - assert isinstance(reduction_factor, int) and reduction_factor > 0 - assert mmcv.is_tuple_of(out_indices, int) - for index in out_indices: - if index not in range(0, len(self.arch_settings[arch]) + 2): - raise ValueError( - 'the item in out_indices must in ' - f'range(0, {len(self.arch_settings[arch])+2}). ' - f'But received {index}') - - if frozen_stages not in range(-1, len(self.arch_settings[arch]) + 2): - raise ValueError('frozen_stages must be in range(-1, ' - f'{len(self.arch_settings[arch])+2}). 
' - f'But received {frozen_stages}') - self.arch = arch - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.reduction_factor = reduction_factor - self.norm_eval = norm_eval - self.with_cp = with_cp - self.layers = self._make_layer() - - def _make_layer(self): - layers = [] - - # build the first layer (layer0) - in_channels = 16 - layer = ConvModule( - in_channels=3, - out_channels=in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=dict(type='Conv2dAdaptivePadding'), - norm_cfg=self.norm_cfg, - act_cfg=dict(type='HSwish')) - self.add_module('layer0', layer) - layers.append('layer0') - - layer_setting = self.arch_settings[self.arch] - for i, params in enumerate(layer_setting): - (kernel_size, mid_channels, out_channels, with_se, act, - stride) = params - - if self.arch == 'large' and i >= 12 or self.arch == 'small' and \ - i >= 8: - mid_channels = mid_channels // self.reduction_factor - out_channels = out_channels // self.reduction_factor - - if with_se: - se_cfg = dict( - channels=mid_channels, - ratio=4, - act_cfg=(dict(type='ReLU'), - dict(type='HSigmoid', bias=3.0, divisor=6.0))) - else: - se_cfg = None - - layer = InvertedResidual( - in_channels=in_channels, - out_channels=out_channels, - mid_channels=mid_channels, - kernel_size=kernel_size, - stride=stride, - se_cfg=se_cfg, - with_expand_conv=(in_channels != mid_channels), - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type=act), - with_cp=self.with_cp) - in_channels = out_channels - layer_name = 'layer{}'.format(i + 1) - self.add_module(layer_name, layer) - layers.append(layer_name) - - # build the last layer - # block5 layer12 os=32 for small model - # block6 layer16 os=32 for large model - layer = ConvModule( - in_channels=in_channels, - out_channels=576 if self.arch == 'small' else 960, - kernel_size=1, - stride=1, - dilation=4, - padding=0, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='HSwish')) - layer_name = 'layer{}'.format(len(layer_setting) + 1) - self.add_module(layer_name, layer) - layers.append(layer_name) - - # next, convert backbone MobileNetV3 to a semantic segmentation version - if self.arch == 'small': - self.layer4.depthwise_conv.conv.stride = (1, 1) - self.layer9.depthwise_conv.conv.stride = (1, 1) - for i in range(4, len(layers)): - layer = getattr(self, layers[i]) - if isinstance(layer, InvertedResidual): - modified_module = layer.depthwise_conv.conv - else: - modified_module = layer.conv - - if i < 9: - modified_module.dilation = (2, 2) - pad = 2 - else: - modified_module.dilation = (4, 4) - pad = 4 - - if not isinstance(modified_module, Conv2dAdaptivePadding): - # Adjust padding - pad *= (modified_module.kernel_size[0] - 1) // 2 - modified_module.padding = (pad, pad) - else: - self.layer7.depthwise_conv.conv.stride = (1, 1) - self.layer13.depthwise_conv.conv.stride = (1, 1) - for i in range(7, len(layers)): - layer = getattr(self, layers[i]) - if isinstance(layer, InvertedResidual): - modified_module = layer.depthwise_conv.conv - else: - modified_module = layer.conv - - if i < 13: - modified_module.dilation = (2, 2) - pad = 2 - else: - modified_module.dilation = (4, 4) - pad = 4 - - if not isinstance(modified_module, Conv2dAdaptivePadding): - # Adjust padding - pad *= (modified_module.kernel_size[0] - 1) // 2 - modified_module.padding = (pad, pad) - - return layers - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() 
- load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices: - outs.append(x) - return outs - - def _freeze_stages(self): - for i in range(self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super(MobileNetV3, self).train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnest.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/resnest.py deleted file mode 100644 index b45a837f395230029e9d4194ff9f7f2f8f7067b0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnest.py +++ /dev/null @@ -1,314 +0,0 @@ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from annotator.uniformer.mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from ..utils import ResLayer -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResNetV1d - - -class RSoftmax(nn.Module): - """Radix Softmax module in ``SplitAttentionConv2d``. - - Args: - radix (int): Radix of input. - groups (int): Groups of input. - """ - - def __init__(self, radix, groups): - super().__init__() - self.radix = radix - self.groups = groups - - def forward(self, x): - batch = x.size(0) - if self.radix > 1: - x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) - x = F.softmax(x, dim=1) - x = x.reshape(batch, -1) - else: - x = torch.sigmoid(x) - return x - - -class SplitAttentionConv2d(nn.Module): - """Split-Attention Conv2d in ResNeSt. - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int | tuple[int]): Same as nn.Conv2d. - stride (int | tuple[int]): Same as nn.Conv2d. - padding (int | tuple[int]): Same as nn.Conv2d. - dilation (int | tuple[int]): Same as nn.Conv2d. - groups (int): Same as nn.Conv2d. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of inter_channels. Default: 4. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. Default: None. - dcn (dict): Config dict for DCN. Default: None. 
- """ - - def __init__(self, - in_channels, - channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - radix=2, - reduction_factor=4, - conv_cfg=None, - norm_cfg=dict(type='BN'), - dcn=None): - super(SplitAttentionConv2d, self).__init__() - inter_channels = max(in_channels * radix // reduction_factor, 32) - self.radix = radix - self.groups = groups - self.channels = channels - self.with_dcn = dcn is not None - self.dcn = dcn - fallback_on_stride = False - if self.with_dcn: - fallback_on_stride = self.dcn.pop('fallback_on_stride', False) - if self.with_dcn and not fallback_on_stride: - assert conv_cfg is None, 'conv_cfg must be None for DCN' - conv_cfg = dcn - self.conv = build_conv_layer( - conv_cfg, - in_channels, - channels * radix, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups * radix, - bias=False) - self.norm0_name, norm0 = build_norm_layer( - norm_cfg, channels * radix, postfix=0) - self.add_module(self.norm0_name, norm0) - self.relu = nn.ReLU(inplace=True) - self.fc1 = build_conv_layer( - None, channels, inter_channels, 1, groups=self.groups) - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, inter_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.fc2 = build_conv_layer( - None, inter_channels, channels * radix, 1, groups=self.groups) - self.rsoftmax = RSoftmax(radix, groups) - - @property - def norm0(self): - """nn.Module: the normalization layer named "norm0" """ - return getattr(self, self.norm0_name) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - def forward(self, x): - x = self.conv(x) - x = self.norm0(x) - x = self.relu(x) - - batch, rchannel = x.shape[:2] - batch = x.size(0) - if self.radix > 1: - splits = x.view(batch, self.radix, -1, *x.shape[2:]) - gap = splits.sum(dim=1) - else: - gap = x - gap = F.adaptive_avg_pool2d(gap, 1) - gap = self.fc1(gap) - - gap = self.norm1(gap) - gap = self.relu(gap) - - atten = self.fc2(gap) - atten = self.rsoftmax(atten).view(batch, -1, 1, 1) - - if self.radix > 1: - attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) - out = torch.sum(attens * splits, dim=1) - else: - out = atten * x - return out.contiguous() - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeSt. - - Args: - inplane (int): Input planes of this block. - planes (int): Middle planes of this block. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of inter_channels in - SplitAttentionConv2d. Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - kwargs (dict): Key word arguments for base class. 
- """ - expansion = 4 - - def __init__(self, - inplanes, - planes, - groups=1, - base_width=4, - base_channels=64, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - """Bottleneck block for ResNeSt.""" - super(Bottleneck, self).__init__(inplanes, planes, **kwargs) - - if groups == 1: - width = self.planes - else: - width = math.floor(self.planes * - (base_width / base_channels)) * groups - - self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, width, postfix=1) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.planes * self.expansion, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.inplanes, - width, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.with_modulated_dcn = False - self.conv2 = SplitAttentionConv2d( - width, - width, - kernel_size=3, - stride=1 if self.avg_down_stride else self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - radix=radix, - reduction_factor=reduction_factor, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - dcn=self.dcn) - delattr(self, self.norm2_name) - - if self.avg_down_stride: - self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) - - self.conv3 = build_conv_layer( - self.conv_cfg, - width, - self.planes * self.expansion, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - def forward(self, x): - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv1_plugin_names) - - out = self.conv2(out) - - if self.avg_down_stride: - out = self.avd_layer(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv2_plugin_names) - - out = self.conv3(out) - out = self.norm3(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv3_plugin_names) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class ResNeSt(ResNetV1d): - """ResNeSt backbone. - - Args: - groups (int): Number of groups of Bottleneck. Default: 1 - base_width (int): Base width of Bottleneck. Default: 4 - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of inter_channels in - SplitAttentionConv2d. Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - kwargs (dict): Keyword arguments for ResNet. 
- """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)), - 200: (Bottleneck, (3, 24, 36, 3)) - } - - def __init__(self, - groups=1, - base_width=4, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - self.groups = groups - self.base_width = base_width - self.radix = radix - self.reduction_factor = reduction_factor - self.avg_down_stride = avg_down_stride - super(ResNeSt, self).__init__(**kwargs) - - def make_res_layer(self, **kwargs): - """Pack all blocks in a stage into a ``ResLayer``.""" - return ResLayer( - groups=self.groups, - base_width=self.base_width, - base_channels=self.base_channels, - radix=self.radix, - reduction_factor=self.reduction_factor, - avg_down_stride=self.avg_down_stride, - **kwargs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnet.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/resnet.py deleted file mode 100644 index 4e52bf048d28ecb069db4728e5f05ad85ac53198..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnet.py +++ /dev/null @@ -1,688 +0,0 @@ -import torch.nn as nn -import torch.utils.checkpoint as cp -from annotator.uniformer.mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, - constant_init, kaiming_init) -from annotator.uniformer.mmcv.runner import load_checkpoint -from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm - -from annotator.uniformer.mmseg.utils import get_root_logger -from ..builder import BACKBONES -from ..utils import ResLayer - - -class BasicBlock(nn.Module): - """Basic block for ResNet.""" - - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - dcn=None, - plugins=None): - super(BasicBlock, self).__init__() - assert dcn is None, 'Not implemented yet.' - assert plugins is None, 'Not implemented yet.' - - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - - self.conv1 = build_conv_layer( - conv_cfg, - inplanes, - planes, - 3, - stride=stride, - padding=dilation, - dilation=dilation, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, planes, planes, 3, padding=1, bias=False) - self.add_module(self.norm2_name, norm2) - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp - - @property - def norm1(self): - """nn.Module: normalization layer after the first convolution layer""" - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: normalization layer after the second convolution layer""" - return getattr(self, self.norm2_name) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - """Bottleneck block for ResNet. 
- - If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is - "caffe", the stride-two layer is the first 1x1 conv layer. - """ - - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - dcn=None, - plugins=None): - super(Bottleneck, self).__init__() - assert style in ['pytorch', 'caffe'] - assert dcn is None or isinstance(dcn, dict) - assert plugins is None or isinstance(plugins, list) - if plugins is not None: - allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] - assert all(p['position'] in allowed_position for p in plugins) - - self.inplanes = inplanes - self.planes = planes - self.stride = stride - self.dilation = dilation - self.style = style - self.with_cp = with_cp - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.dcn = dcn - self.with_dcn = dcn is not None - self.plugins = plugins - self.with_plugins = plugins is not None - - if self.with_plugins: - # collect plugins for conv1/conv2/conv3 - self.after_conv1_plugins = [ - plugin['cfg'] for plugin in plugins - if plugin['position'] == 'after_conv1' - ] - self.after_conv2_plugins = [ - plugin['cfg'] for plugin in plugins - if plugin['position'] == 'after_conv2' - ] - self.after_conv3_plugins = [ - plugin['cfg'] for plugin in plugins - if plugin['position'] == 'after_conv3' - ] - - if self.style == 'pytorch': - self.conv1_stride = 1 - self.conv2_stride = stride - else: - self.conv1_stride = stride - self.conv2_stride = 1 - - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - norm_cfg, planes * self.expansion, postfix=3) - - self.conv1 = build_conv_layer( - conv_cfg, - inplanes, - planes, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - fallback_on_stride = False - if self.with_dcn: - fallback_on_stride = dcn.pop('fallback_on_stride', False) - if not self.with_dcn or fallback_on_stride: - self.conv2 = build_conv_layer( - conv_cfg, - planes, - planes, - kernel_size=3, - stride=self.conv2_stride, - padding=dilation, - dilation=dilation, - bias=False) - else: - assert self.conv_cfg is None, 'conv_cfg must be None for DCN' - self.conv2 = build_conv_layer( - dcn, - planes, - planes, - kernel_size=3, - stride=self.conv2_stride, - padding=dilation, - dilation=dilation, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - conv_cfg, - planes, - planes * self.expansion, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - - if self.with_plugins: - self.after_conv1_plugin_names = self.make_block_plugins( - planes, self.after_conv1_plugins) - self.after_conv2_plugin_names = self.make_block_plugins( - planes, self.after_conv2_plugins) - self.after_conv3_plugin_names = self.make_block_plugins( - planes * self.expansion, self.after_conv3_plugins) - - def make_block_plugins(self, in_channels, plugins): - """make plugins for block. - - Args: - in_channels (int): Input channels of plugin. - plugins (list[dict]): List of plugins cfg to build. - - Returns: - list[str]: List of the names of plugin. 
- """ - assert isinstance(plugins, list) - plugin_names = [] - for plugin in plugins: - plugin = plugin.copy() - name, layer = build_plugin_layer( - plugin, - in_channels=in_channels, - postfix=plugin.pop('postfix', '')) - assert not hasattr(self, name), f'duplicate plugin {name}' - self.add_module(name, layer) - plugin_names.append(name) - return plugin_names - - def forward_plugin(self, x, plugin_names): - """Forward function for plugins.""" - out = x - for name in plugin_names: - out = getattr(self, name)(x) - return out - - @property - def norm1(self): - """nn.Module: normalization layer after the first convolution layer""" - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: normalization layer after the second convolution layer""" - return getattr(self, self.norm2_name) - - @property - def norm3(self): - """nn.Module: normalization layer after the third convolution layer""" - return getattr(self, self.norm3_name) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv1_plugin_names) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv2_plugin_names) - - out = self.conv3(out) - out = self.norm3(out) - - if self.with_plugins: - out = self.forward_plugin(out, self.after_conv3_plugin_names) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class ResNet(nn.Module): - """ResNet backbone. - - Args: - depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. - in_channels (int): Number of input image channels. Default" 3. - stem_channels (int): Number of stem channels. Default: 64. - base_channels (int): Number of base channels of res layer. Default: 64. - num_stages (int): Resnet stages, normally 4. - strides (Sequence[int]): Strides of the first block of each stage. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. - norm_cfg (dict): Dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - plugins (list[dict]): List of plugins for stages, each dict contains: - - - cfg (dict, required): Cfg dict to build plugin. - - - position (str, required): Position inside block to insert plugin, - options: 'after_conv1', 'after_conv2', 'after_conv3'. - - - stages (tuple[bool], optional): Stages to apply plugin, length - should be same as 'num_stages' - multi_grid (Sequence[int]|None): Multi grid dilation rates of last - stage. 
Default: None - contract_dilation (bool): Whether contract first dilation of each layer - Default: False - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from annotator.uniformer.mmseg.models import ResNet - >>> import torch - >>> self = ResNet(depth=18) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 64, 8, 8) - (1, 128, 4, 4) - (1, 256, 2, 2) - (1, 512, 1, 1) - """ - - arch_settings = { - 18: (BasicBlock, (2, 2, 2, 2)), - 34: (BasicBlock, (3, 4, 6, 3)), - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, - depth, - in_channels=3, - stem_channels=64, - base_channels=64, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(0, 1, 2, 3), - style='pytorch', - deep_stem=False, - avg_down=False, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - dcn=None, - stage_with_dcn=(False, False, False, False), - plugins=None, - multi_grid=None, - contract_dilation=False, - with_cp=False, - zero_init_residual=True): - super(ResNet, self).__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for resnet') - self.depth = depth - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages - assert num_stages >= 1 and num_stages <= 4 - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices - assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.dcn = dcn - self.stage_with_dcn = stage_with_dcn - if dcn is not None: - assert len(stage_with_dcn) == num_stages - self.plugins = plugins - self.multi_grid = multi_grid - self.contract_dilation = contract_dilation - self.zero_init_residual = zero_init_residual - self.block, stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - self.inplanes = stem_channels - - self._make_stem_layer(in_channels, stem_channels) - - self.res_layers = [] - for i, num_blocks in enumerate(self.stage_blocks): - stride = strides[i] - dilation = dilations[i] - dcn = self.dcn if self.stage_with_dcn[i] else None - if plugins is not None: - stage_plugins = self.make_stage_plugins(plugins, i) - else: - stage_plugins = None - # multi grid is applied to last layer only - stage_multi_grid = multi_grid if i == len( - self.stage_blocks) - 1 else None - planes = base_channels * 2**i - res_layer = self.make_res_layer( - block=self.block, - inplanes=self.inplanes, - planes=planes, - num_blocks=num_blocks, - stride=stride, - dilation=dilation, - style=self.style, - avg_down=self.avg_down, - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - dcn=dcn, - plugins=stage_plugins, - multi_grid=stage_multi_grid, - contract_dilation=contract_dilation) - self.inplanes = planes * self.block.expansion - layer_name = f'layer{i+1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) 
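# A small standalone sketch of the channel bookkeeping done by the stage loop
# above, assuming the default base_channels=64 and a Bottleneck depth (e.g. 50)
# where block.expansion == 4; every value below comes from those defaults.
# Stage i is built with planes = base_channels * 2**i and ends up exposing
# planes * expansion channels, which is what out_indices picks from in forward().
expansion, base_channels, num_stages = 4, 64, 4
stage_out_channels = [base_channels * 2**i * expansion for i in range(num_stages)]
# stage_out_channels == [256, 512, 1024, 2048]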
- - self._freeze_stages() - - self.feat_dim = self.block.expansion * base_channels * 2**( - len(self.stage_blocks) - 1) - - def make_stage_plugins(self, plugins, stage_idx): - """make plugins for ResNet 'stage_idx'th stage . - - Currently we support to insert 'context_block', - 'empirical_attention_block', 'nonlocal_block' into the backbone like - ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of - Bottleneck. - - An example of plugins format could be : - >>> plugins=[ - ... dict(cfg=dict(type='xxx', arg1='xxx'), - ... stages=(False, True, True, True), - ... position='after_conv2'), - ... dict(cfg=dict(type='yyy'), - ... stages=(True, True, True, True), - ... position='after_conv3'), - ... dict(cfg=dict(type='zzz', postfix='1'), - ... stages=(True, True, True, True), - ... position='after_conv3'), - ... dict(cfg=dict(type='zzz', postfix='2'), - ... stages=(True, True, True, True), - ... position='after_conv3') - ... ] - >>> self = ResNet(depth=18) - >>> stage_plugins = self.make_stage_plugins(plugins, 0) - >>> assert len(stage_plugins) == 3 - - Suppose 'stage_idx=0', the structure of blocks in the stage would be: - conv1-> conv2->conv3->yyy->zzz1->zzz2 - Suppose 'stage_idx=1', the structure of blocks in the stage would be: - conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 - - If stages is missing, the plugin would be applied to all stages. - - Args: - plugins (list[dict]): List of plugins cfg to build. The postfix is - required if multiple same type plugins are inserted. - stage_idx (int): Index of stage to build - - Returns: - list[dict]: Plugins for current stage - """ - stage_plugins = [] - for plugin in plugins: - plugin = plugin.copy() - stages = plugin.pop('stages', None) - assert stages is None or len(stages) == self.num_stages - # whether to insert plugin into current stage - if stages is None or stages[stage_idx]: - stage_plugins.append(plugin) - - return stage_plugins - - def make_res_layer(self, **kwargs): - """Pack all blocks in a stage into a ``ResLayer``.""" - return ResLayer(**kwargs) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - def _make_stem_layer(self, in_channels, stem_channels): - """Make stem layer for ResNet.""" - if self.deep_stem: - self.stem = nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - stem_channels // 2, - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, stem_channels // 2)[1], - nn.ReLU(inplace=True), - build_conv_layer( - self.conv_cfg, - stem_channels // 2, - stem_channels // 2, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, stem_channels // 2)[1], - nn.ReLU(inplace=True), - build_conv_layer( - self.conv_cfg, - stem_channels // 2, - stem_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, stem_channels)[1], - nn.ReLU(inplace=True)) - else: - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - stem_channels, - kernel_size=7, - stride=2, - padding=3, - bias=False) - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, stem_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - def _freeze_stages(self): - """Freeze stages param and norm stats.""" - if self.frozen_stages >= 0: - if self.deep_stem: - self.stem.eval() - for param in self.stem.parameters(): - param.requires_grad = 
False - else: - self.norm1.eval() - for m in [self.conv1, self.norm1]: - for param in m.parameters(): - param.requires_grad = False - - for i in range(1, self.frozen_stages + 1): - m = getattr(self, f'layer{i}') - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.dcn is not None: - for m in self.modules(): - if isinstance(m, Bottleneck) and hasattr( - m, 'conv2_offset'): - constant_init(m.conv2_offset, 0) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - if self.deep_stem: - x = self.stem(x) - else: - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.maxpool(x) - outs = [] - for i, layer_name in enumerate(self.res_layers): - res_layer = getattr(self, layer_name) - x = res_layer(x) - if i in self.out_indices: - outs.append(x) - return tuple(outs) - - def train(self, mode=True): - """Convert the model into training mode while keep normalization layer - freezed.""" - super(ResNet, self).train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() - - -@BACKBONES.register_module() -class ResNetV1c(ResNet): - """ResNetV1c variant described in [1]_. - - Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv - in the input stem with three 3x3 convs. - - References: - .. [1] https://arxiv.org/pdf/1812.01187.pdf - """ - - def __init__(self, **kwargs): - super(ResNetV1c, self).__init__( - deep_stem=True, avg_down=False, **kwargs) - - -@BACKBONES.register_module() -class ResNetV1d(ResNet): - """ResNetV1d variant described in [1]_. - - Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in - the input stem with three 3x3 convs. And in the downsampling block, a 2x2 - avg_pool with stride 2 is added before conv, whose stride is changed to 1. - """ - - def __init__(self, **kwargs): - super(ResNetV1d, self).__init__( - deep_stem=True, avg_down=True, **kwargs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnext.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/resnext.py deleted file mode 100644 index 962249ad6fd9b50960ad6426f7ce3cac6ed8c5bc..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/resnext.py +++ /dev/null @@ -1,145 +0,0 @@ -import math - -from annotator.uniformer.mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from ..utils import ResLayer -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResNet - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeXt. - - If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is - "caffe", the stride-two layer is the first 1x1 conv layer. 
- """ - - def __init__(self, - inplanes, - planes, - groups=1, - base_width=4, - base_channels=64, - **kwargs): - super(Bottleneck, self).__init__(inplanes, planes, **kwargs) - - if groups == 1: - width = self.planes - else: - width = math.floor(self.planes * - (base_width / base_channels)) * groups - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, width, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, width, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.planes * self.expansion, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.inplanes, - width, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - fallback_on_stride = False - self.with_modulated_dcn = False - if self.with_dcn: - fallback_on_stride = self.dcn.pop('fallback_on_stride', False) - if not self.with_dcn or fallback_on_stride: - self.conv2 = build_conv_layer( - self.conv_cfg, - width, - width, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - else: - assert self.conv_cfg is None, 'conv_cfg must be None for DCN' - self.conv2 = build_conv_layer( - self.dcn, - width, - width, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - self.conv_cfg, - width, - self.planes * self.expansion, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - -@BACKBONES.register_module() -class ResNeXt(ResNet): - """ResNeXt backbone. - - Args: - depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. - in_channels (int): Number of input image channels. Normally 3. - num_stages (int): Resnet stages, normally 4. - groups (int): Group of resnext. - base_width (int): Base width of resnext. - strides (Sequence[int]): Strides of the first block of each stage. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from annotator.uniformer.mmseg.models import ResNeXt - >>> import torch - >>> self = ResNeXt(depth=50) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 256, 8, 8) - (1, 512, 4, 4) - (1, 1024, 2, 2) - (1, 2048, 1, 1) - """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, groups=1, base_width=4, **kwargs): - self.groups = groups - self.base_width = base_width - super(ResNeXt, self).__init__(**kwargs) - - def make_res_layer(self, **kwargs): - """Pack all blocks in a stage into a ``ResLayer``""" - return ResLayer( - groups=self.groups, - base_width=self.base_width, - base_channels=self.base_channels, - **kwargs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/unet.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/unet.py deleted file mode 100644 index 82caa16a94c195c192a2a920fb7bc7e60f0f3ce3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/unet.py +++ /dev/null @@ -1,429 +0,0 @@ -import torch.nn as nn -import torch.utils.checkpoint as cp -from annotator.uniformer.mmcv.cnn import (UPSAMPLE_LAYERS, ConvModule, build_activation_layer, - build_norm_layer, constant_init, kaiming_init) -from annotator.uniformer.mmcv.runner import load_checkpoint -from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm - -from annotator.uniformer.mmseg.utils import get_root_logger -from ..builder import BACKBONES -from ..utils import UpConvBlock - - -class BasicConvBlock(nn.Module): - """Basic convolutional block for UNet. - - This module consists of several plain convolutional layers. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - num_convs (int): Number of convolutional layers. Default: 2. - stride (int): Whether use stride convolution to downsample - the input feature map. If stride=2, it only uses stride convolution - in the first convolutional layer to downsample the input feature - map. Options are 1 or 2. Default: 1. - dilation (int): Whether use dilated convolution to expand the - receptive field. Set dilation rate of each convolutional layer and - the dilation rate of the first convolutional layer is always 1. - Default: 1. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - conv_cfg (dict | None): Config dict for convolution layer. - Default: None. - norm_cfg (dict | None): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict | None): Config dict for activation layer in ConvModule. - Default: dict(type='ReLU'). - dcn (bool): Use deformable convolution in convolutional layer or not. - Default: None. - plugins (dict): plugins for convolutional layers. Default: None. - """ - - def __init__(self, - in_channels, - out_channels, - num_convs=2, - stride=1, - dilation=1, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - dcn=None, - plugins=None): - super(BasicConvBlock, self).__init__() - assert dcn is None, 'Not implemented yet.' - assert plugins is None, 'Not implemented yet.' 
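# A small sketch, with hypothetical example values, of how the ConvModule list
# assembled below is parameterised: only the first conv in the block applies
# the requested stride (with padding 1 and dilation 1), while the remaining
# num_convs - 1 convs keep stride 1 and reuse `dilation` for both padding and
# dilation, so the block changes spatial resolution at most once.
num_convs, stride, dilation = 2, 2, 2  # hypothetical values, not the defaults
per_conv = [
    dict(stride=stride if i == 0 else 1,
         dilation=1 if i == 0 else dilation,
         padding=1 if i == 0 else dilation)
    for i in range(num_convs)
]
# per_conv == [{'stride': 2, 'dilation': 1, 'padding': 1},
#              {'stride': 1, 'dilation': 2, 'padding': 2}]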
- - self.with_cp = with_cp - convs = [] - for i in range(num_convs): - convs.append( - ConvModule( - in_channels=in_channels if i == 0 else out_channels, - out_channels=out_channels, - kernel_size=3, - stride=stride if i == 0 else 1, - dilation=1 if i == 0 else dilation, - padding=1 if i == 0 else dilation, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - self.convs = nn.Sequential(*convs) - - def forward(self, x): - """Forward function.""" - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(self.convs, x) - else: - out = self.convs(x) - return out - - -@UPSAMPLE_LAYERS.register_module() -class DeconvModule(nn.Module): - """Deconvolution upsample module in decoder for UNet (2X upsample). - - This module uses deconvolution to upsample feature map in the decoder - of UNet. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - norm_cfg (dict | None): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict | None): Config dict for activation layer in ConvModule. - Default: dict(type='ReLU'). - kernel_size (int): Kernel size of the convolutional layer. Default: 4. - """ - - def __init__(self, - in_channels, - out_channels, - with_cp=False, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - *, - kernel_size=4, - scale_factor=2): - super(DeconvModule, self).__init__() - - assert (kernel_size - scale_factor >= 0) and\ - (kernel_size - scale_factor) % 2 == 0,\ - f'kernel_size should be greater than or equal to scale_factor '\ - f'and (kernel_size - scale_factor) should be even numbers, '\ - f'while the kernel size is {kernel_size} and scale_factor is '\ - f'{scale_factor}.' - - stride = scale_factor - padding = (kernel_size - scale_factor) // 2 - self.with_cp = with_cp - deconv = nn.ConvTranspose2d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding) - - norm_name, norm = build_norm_layer(norm_cfg, out_channels) - activate = build_activation_layer(act_cfg) - self.deconv_upsamping = nn.Sequential(deconv, norm, activate) - - def forward(self, x): - """Forward function.""" - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(self.deconv_upsamping, x) - else: - out = self.deconv_upsamping(x) - return out - - -@UPSAMPLE_LAYERS.register_module() -class InterpConv(nn.Module): - """Interpolation upsample module in decoder for UNet. - - This module uses interpolation to upsample feature map in the decoder - of UNet. It consists of one interpolation upsample layer and one - convolutional layer. It can be one interpolation upsample layer followed - by one convolutional layer (conv_first=False) or one convolutional layer - followed by one interpolation upsample layer (conv_first=True). - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - norm_cfg (dict | None): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict | None): Config dict for activation layer in ConvModule. - Default: dict(type='ReLU'). - conv_cfg (dict | None): Config dict for convolution layer. - Default: None. - conv_first (bool): Whether convolutional layer or interpolation - upsample layer first. Default: False. 
It means interpolation - upsample layer followed by one convolutional layer. - kernel_size (int): Kernel size of the convolutional layer. Default: 1. - stride (int): Stride of the convolutional layer. Default: 1. - padding (int): Padding of the convolutional layer. Default: 1. - upsample_cfg (dict): Interpolation config of the upsample layer. - Default: dict( - scale_factor=2, mode='bilinear', align_corners=False). - """ - - def __init__(self, - in_channels, - out_channels, - with_cp=False, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - *, - conv_cfg=None, - conv_first=False, - kernel_size=1, - stride=1, - padding=0, - upsample_cfg=dict( - scale_factor=2, mode='bilinear', align_corners=False)): - super(InterpConv, self).__init__() - - self.with_cp = with_cp - conv = ConvModule( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - upsample = nn.Upsample(**upsample_cfg) - if conv_first: - self.interp_upsample = nn.Sequential(conv, upsample) - else: - self.interp_upsample = nn.Sequential(upsample, conv) - - def forward(self, x): - """Forward function.""" - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(self.interp_upsample, x) - else: - out = self.interp_upsample(x) - return out - - -@BACKBONES.register_module() -class UNet(nn.Module): - """UNet backbone. - U-Net: Convolutional Networks for Biomedical Image Segmentation. - https://arxiv.org/pdf/1505.04597.pdf - - Args: - in_channels (int): Number of input image channels. Default" 3. - base_channels (int): Number of base channels of each stage. - The output channels of the first stage. Default: 64. - num_stages (int): Number of stages in encoder, normally 5. Default: 5. - strides (Sequence[int 1 | 2]): Strides of each stage in encoder. - len(strides) is equal to num_stages. Normally the stride of the - first stage in encoder is 1. If strides[i]=2, it uses stride - convolution to downsample in the correspondence encoder stage. - Default: (1, 1, 1, 1, 1). - enc_num_convs (Sequence[int]): Number of convolutional layers in the - convolution block of the correspondence encoder stage. - Default: (2, 2, 2, 2, 2). - dec_num_convs (Sequence[int]): Number of convolutional layers in the - convolution block of the correspondence decoder stage. - Default: (2, 2, 2, 2). - downsamples (Sequence[int]): Whether use MaxPool to downsample the - feature map after the first stage of encoder - (stages: [1, num_stages)). If the correspondence encoder stage use - stride convolution (strides[i]=2), it will never use MaxPool to - downsample, even downsamples[i-1]=True. - Default: (True, True, True, True). - enc_dilations (Sequence[int]): Dilation rate of each stage in encoder. - Default: (1, 1, 1, 1, 1). - dec_dilations (Sequence[int]): Dilation rate of each stage in decoder. - Default: (1, 1, 1, 1). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - conv_cfg (dict | None): Config dict for convolution layer. - Default: None. - norm_cfg (dict | None): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict | None): Config dict for activation layer in ConvModule. - Default: dict(type='ReLU'). - upsample_cfg (dict): The upsample config of the upsample module in - decoder. Default: dict(type='InterpConv'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). 
Note: Effect on Batch Norm - and its variants only. Default: False. - dcn (bool): Use deformable convolution in convolutional layer or not. - Default: None. - plugins (dict): plugins for convolutional layers. Default: None. - - Notice: - The input image size should be divisible by the whole downsample rate - of the encoder. More detail of the whole downsample rate can be found - in UNet._check_input_divisible. - - """ - - def __init__(self, - in_channels=3, - base_channels=64, - num_stages=5, - strides=(1, 1, 1, 1, 1), - enc_num_convs=(2, 2, 2, 2, 2), - dec_num_convs=(2, 2, 2, 2), - downsamples=(True, True, True, True), - enc_dilations=(1, 1, 1, 1, 1), - dec_dilations=(1, 1, 1, 1), - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - upsample_cfg=dict(type='InterpConv'), - norm_eval=False, - dcn=None, - plugins=None): - super(UNet, self).__init__() - assert dcn is None, 'Not implemented yet.' - assert plugins is None, 'Not implemented yet.' - assert len(strides) == num_stages, \ - 'The length of strides should be equal to num_stages, '\ - f'while the strides is {strides}, the length of '\ - f'strides is {len(strides)}, and the num_stages is '\ - f'{num_stages}.' - assert len(enc_num_convs) == num_stages, \ - 'The length of enc_num_convs should be equal to num_stages, '\ - f'while the enc_num_convs is {enc_num_convs}, the length of '\ - f'enc_num_convs is {len(enc_num_convs)}, and the num_stages is '\ - f'{num_stages}.' - assert len(dec_num_convs) == (num_stages-1), \ - 'The length of dec_num_convs should be equal to (num_stages-1), '\ - f'while the dec_num_convs is {dec_num_convs}, the length of '\ - f'dec_num_convs is {len(dec_num_convs)}, and the num_stages is '\ - f'{num_stages}.' - assert len(downsamples) == (num_stages-1), \ - 'The length of downsamples should be equal to (num_stages-1), '\ - f'while the downsamples is {downsamples}, the length of '\ - f'downsamples is {len(downsamples)}, and the num_stages is '\ - f'{num_stages}.' - assert len(enc_dilations) == num_stages, \ - 'The length of enc_dilations should be equal to num_stages, '\ - f'while the enc_dilations is {enc_dilations}, the length of '\ - f'enc_dilations is {len(enc_dilations)}, and the num_stages is '\ - f'{num_stages}.' - assert len(dec_dilations) == (num_stages-1), \ - 'The length of dec_dilations should be equal to (num_stages-1), '\ - f'while the dec_dilations is {dec_dilations}, the length of '\ - f'dec_dilations is {len(dec_dilations)}, and the num_stages is '\ - f'{num_stages}.' 
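# A small sketch, using only this class's default arguments, of the constraint
# later enforced by _check_input_divisible: every stage after the first halves
# the resolution whenever strides[i] == 2 or downsamples[i - 1] is True, so the
# input height and width must be divisible by the product of those factors
# (16 with the defaults below).
strides, downsamples, num_stages = (1, 1, 1, 1, 1), (True, True, True, True), 5
whole_downsample_rate = 1
for i in range(1, num_stages):
    if strides[i] == 2 or downsamples[i - 1]:
        whole_downsample_rate *= 2
# whole_downsample_rate == 16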
- self.num_stages = num_stages - self.strides = strides - self.downsamples = downsamples - self.norm_eval = norm_eval - self.base_channels = base_channels - - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - - for i in range(num_stages): - enc_conv_block = [] - if i != 0: - if strides[i] == 1 and downsamples[i - 1]: - enc_conv_block.append(nn.MaxPool2d(kernel_size=2)) - upsample = (strides[i] != 1 or downsamples[i - 1]) - self.decoder.append( - UpConvBlock( - conv_block=BasicConvBlock, - in_channels=base_channels * 2**i, - skip_channels=base_channels * 2**(i - 1), - out_channels=base_channels * 2**(i - 1), - num_convs=dec_num_convs[i - 1], - stride=1, - dilation=dec_dilations[i - 1], - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - upsample_cfg=upsample_cfg if upsample else None, - dcn=None, - plugins=None)) - - enc_conv_block.append( - BasicConvBlock( - in_channels=in_channels, - out_channels=base_channels * 2**i, - num_convs=enc_num_convs[i], - stride=strides[i], - dilation=enc_dilations[i], - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dcn=None, - plugins=None)) - self.encoder.append((nn.Sequential(*enc_conv_block))) - in_channels = base_channels * 2**i - - def forward(self, x): - self._check_input_divisible(x) - enc_outs = [] - for enc in self.encoder: - x = enc(x) - enc_outs.append(x) - dec_outs = [x] - for i in reversed(range(len(self.decoder))): - x = self.decoder[i](enc_outs[i], x) - dec_outs.append(x) - - return dec_outs - - def train(self, mode=True): - """Convert the model into training mode while keep normalization layer - freezed.""" - super(UNet, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() - - def _check_input_divisible(self, x): - h, w = x.shape[-2:] - whole_downsample_rate = 1 - for i in range(1, self.num_stages): - if self.strides[i] == 2 or self.downsamples[i - 1]: - whole_downsample_rate *= 2 - assert (h % whole_downsample_rate == 0) \ - and (w % whole_downsample_rate == 0),\ - f'The input image size {(h, w)} should be divisible by the whole '\ - f'downsample rate {whole_downsample_rate}, when num_stages is '\ - f'{self.num_stages}, strides is {self.strides}, and downsamples '\ - f'is {self.downsamples}.' - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/uniformer.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/uniformer.py deleted file mode 100644 index 0c4bb88e4c928540cca9ab609988b916520f5b7a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/uniformer.py +++ /dev/null @@ -1,422 +0,0 @@ -# -------------------------------------------------------- -# UniFormer -# Copyright (c) 2022 SenseTime X-Lab -# Licensed under The MIT License [see LICENSE for details] -# Written by Kunchang Li -# -------------------------------------------------------- - -from collections import OrderedDict -import math - -from functools import partial -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -import numpy as np -from timm.models.layers import DropPath, to_2tuple, trunc_normal_ - -from annotator.uniformer.mmcv_custom import load_checkpoint -from annotator.uniformer.mmseg.utils import get_root_logger -from ..builder import BACKBONES - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class CMlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, 1) - self.act = act_layer() - self.fc2 = nn.Conv2d(hidden_features, out_features, 1) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class CBlock(nn.Module): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) - self.norm1 = nn.BatchNorm2d(dim) - self.conv1 = nn.Conv2d(dim, dim, 1) - self.conv2 = nn.Conv2d(dim, dim, 1) - self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() - self.norm2 = nn.BatchNorm2d(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward(self, x): - x = x + self.pos_embed(x) - x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class SABlock(nn.Module): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, - num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward(self, x): - x = x + self.pos_embed(x) - B, N, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - x = x.transpose(1, 2).reshape(B, N, H, W) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class SABlock_Windows(nn.Module): - def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.window_size=window_size - self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, - num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward(self, x): - x = x + self.pos_embed(x) - x = x.permute(0, 2, 3, 1) - B, H, W, C = x.shape - shortcut = x - x = self.norm1(x) - - pad_l = pad_t = 0 - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) - _, Hp, Wp, _ = x.shape - - x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) - x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C - - # reverse cyclic shift - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :].contiguous() - - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - x = x.permute(0, 3, 1, 2).reshape(B, C, H, W) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - self.norm = nn.LayerNorm(embed_dim) - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - B, _, H, W = x.shape - x = self.proj(x) - B, _, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - x = self.norm(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - return x - - -@BACKBONES.register_module() -class UniFormer(nn.Module): - """ Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 - """ - def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512], - head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), - pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0], - windows=False, hybrid=False, window_size=14): - """ - Args: - layer (list): number of block in each layer - img_size (int, tuple): input image size - in_chans (int): number of input channels - num_classes (int): number of classes for classification head - embed_dim (int): embedding dimension - head_dim (int): dimension of attention heads - mlp_ratio (int): ratio of mlp hidden dim to embedding dim - qkv_bias (bool): enable bias for qkv if True - qk_scale (float): override default qk scale of head_dim ** -0.5 if set - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set - drop_rate (float): dropout rate - attn_drop_rate (float): attention dropout rate - drop_path_rate (float): stochastic depth rate - norm_layer (nn.Module): normalization layer - pretrained_path (str): path of pretrained model - use_checkpoint (bool): whether use checkpoint - checkpoint_num (list): 
index for using checkpoint in every stage - windows (bool): whether use window MHRA - hybrid (bool): whether use hybrid MHRA - window_size (int): size of window (>14) - """ - super().__init__() - self.num_classes = num_classes - self.use_checkpoint = use_checkpoint - self.checkpoint_num = checkpoint_num - self.windows = windows - print(f'Use Checkpoint: {self.use_checkpoint}') - print(f'Checkpoint Number: {self.checkpoint_num}') - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - - self.patch_embed1 = PatchEmbed( - img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0]) - self.patch_embed2 = PatchEmbed( - img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1]) - self.patch_embed3 = PatchEmbed( - img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2]) - self.patch_embed4 = PatchEmbed( - img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3]) - - self.pos_drop = nn.Dropout(p=drop_rate) - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule - num_heads = [dim // head_dim for dim in embed_dim] - self.blocks1 = nn.ModuleList([ - CBlock( - dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) - for i in range(layers[0])]) - self.norm1=norm_layer(embed_dim[0]) - self.blocks2 = nn.ModuleList([ - CBlock( - dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer) - for i in range(layers[1])]) - self.norm2 = norm_layer(embed_dim[1]) - if self.windows: - print('Use local window for all blocks in stage3') - self.blocks3 = nn.ModuleList([ - SABlock_Windows( - dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) - for i in range(layers[2])]) - elif hybrid: - print('Use hybrid window for blocks in stage3') - block3 = [] - for i in range(layers[2]): - if (i + 1) % 4 == 0: - block3.append(SABlock( - dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) - else: - block3.append(SABlock_Windows( - dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) - self.blocks3 = nn.ModuleList(block3) - else: - print('Use global window for all blocks in stage3') - self.blocks3 = nn.ModuleList([ - SABlock( - dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) - for i in range(layers[2])]) - self.norm3 = norm_layer(embed_dim[2]) - self.blocks4 = nn.ModuleList([ - SABlock( - dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, 
drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer) - for i in range(layers[3])]) - self.norm4 = norm_layer(embed_dim[3]) - - # Representation layer - if representation_size: - self.num_features = representation_size - self.pre_logits = nn.Sequential(OrderedDict([ - ('fc', nn.Linear(embed_dim, representation_size)), - ('act', nn.Tanh()) - ])) - else: - self.pre_logits = nn.Identity() - - self.apply(self._init_weights) - self.init_weights(pretrained=pretrained_path) - - def init_weights(self, pretrained): - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) - print(f'Load pretrained model from {pretrained}') - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=''): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - out = [] - x = self.patch_embed1(x) - x = self.pos_drop(x) - for i, blk in enumerate(self.blocks1): - if self.use_checkpoint and i < self.checkpoint_num[0]: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - x_out = self.norm1(x.permute(0, 2, 3, 1)) - out.append(x_out.permute(0, 3, 1, 2).contiguous()) - x = self.patch_embed2(x) - for i, blk in enumerate(self.blocks2): - if self.use_checkpoint and i < self.checkpoint_num[1]: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - x_out = self.norm2(x.permute(0, 2, 3, 1)) - out.append(x_out.permute(0, 3, 1, 2).contiguous()) - x = self.patch_embed3(x) - for i, blk in enumerate(self.blocks3): - if self.use_checkpoint and i < self.checkpoint_num[2]: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - x_out = self.norm3(x.permute(0, 2, 3, 1)) - out.append(x_out.permute(0, 3, 1, 2).contiguous()) - x = self.patch_embed4(x) - for i, blk in enumerate(self.blocks4): - if self.use_checkpoint and i < self.checkpoint_num[3]: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - x_out = self.norm4(x.permute(0, 2, 3, 1)) - out.append(x_out.permute(0, 3, 1, 2).contiguous()) - return tuple(out) - - def forward(self, x): - x = self.forward_features(x) - return x diff --git a/ControlNet/annotator/uniformer/mmseg/models/backbones/vit.py b/ControlNet/annotator/uniformer/mmseg/models/backbones/vit.py deleted file mode 100644 index 59e4479650690e08cbc4cab9427aefda47c2116d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/backbones/vit.py +++ /dev/null @@ -1,459 +0,0 @@ -"""Modified from https://github.com/rwightman/pytorch-image- -models/blob/master/timm/models/vision_transformer.py.""" - -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from annotator.uniformer.mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, - constant_init, kaiming_init, normal_init) -from annotator.uniformer.mmcv.runner import _load_checkpoint -from annotator.uniformer.mmcv.utils.parrots_wrapper import _BatchNorm - -from annotator.uniformer.mmseg.utils import get_root_logger -from 
..builder import BACKBONES -from ..utils import DropPath, trunc_normal_ - - -class Mlp(nn.Module): - """MLP layer for Encoder block. - - Args: - in_features(int): Input dimension for the first fully - connected layer. - hidden_features(int): Output dimension for the first fully - connected layer. - out_features(int): Output dementsion for the second fully - connected layer. - act_cfg(dict): Config dict for activation layer. - Default: dict(type='GELU'). - drop(float): Drop rate for the dropout layer. Dropout rate has - to be between 0 and 1. Default: 0. - """ - - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_cfg=dict(type='GELU'), - drop=0.): - super(Mlp, self).__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = Linear(in_features, hidden_features) - self.act = build_activation_layer(act_cfg) - self.fc2 = Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - """Attention layer for Encoder block. - - Args: - dim (int): Dimension for the input vector. - num_heads (int): Number of parallel attention heads. - qkv_bias (bool): Enable bias for qkv if True. Default: False. - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. - attn_drop (float): Drop rate for attention output weights. - Default: 0. - proj_drop (float): Drop rate for output weights. Default: 0. - """ - - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - super(Attention, self).__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - b, n, c = x.shape - qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, - c // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(b, n, c) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - """Implements encoder block with residual connection. - - Args: - dim (int): The feature dimension. - num_heads (int): Number of parallel attention heads. - mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. - drop (float): Drop rate for mlp output weights. Default: 0. - attn_drop (float): Drop rate for attention output weights. - Default: 0. - proj_drop (float): Drop rate for attn layer output weights. - Default: 0. - drop_path (float): Drop rate for paths of model. - Default: 0. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='GELU'). - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN', requires_grad=True). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - dim, - num_heads, - mlp_ratio=4, - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - proj_drop=0., - drop_path=0., - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN', eps=1e-6), - with_cp=False): - super(Block, self).__init__() - self.with_cp = with_cp - _, self.norm1 = build_norm_layer(norm_cfg, dim) - self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, - proj_drop) - self.drop_path = DropPath( - drop_path) if drop_path > 0. else nn.Identity() - _, self.norm2 = build_norm_layer(norm_cfg, dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_cfg=act_cfg, - drop=drop) - - def forward(self, x): - - def _inner_forward(x): - out = x + self.drop_path(self.attn(self.norm1(x))) - out = out + self.drop_path(self.mlp(self.norm2(out))) - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class PatchEmbed(nn.Module): - """Image to Patch Embedding. - - Args: - img_size (int | tuple): Input image size. - default: 224. - patch_size (int): Width and height for a patch. - default: 16. - in_channels (int): Input channels for images. Default: 3. - embed_dim (int): The embedding dimension. Default: 768. - """ - - def __init__(self, - img_size=224, - patch_size=16, - in_channels=3, - embed_dim=768): - super(PatchEmbed, self).__init__() - if isinstance(img_size, int): - self.img_size = (img_size, img_size) - elif isinstance(img_size, tuple): - self.img_size = img_size - else: - raise TypeError('img_size must be type of int or tuple') - h, w = self.img_size - self.patch_size = (patch_size, patch_size) - self.num_patches = (h // patch_size) * (w // patch_size) - self.proj = Conv2d( - in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - return self.proj(x).flatten(2).transpose(1, 2) - - -@BACKBONES.register_module() -class VisionTransformer(nn.Module): - """Vision transformer backbone. - - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for - Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 - - Args: - img_size (tuple): input image size. Default: (224, 224). - patch_size (int, tuple): patch size. Default: 16. - in_channels (int): number of input channels. Default: 3. - embed_dim (int): embedding dimension. Default: 768. - depth (int): depth of transformer. Default: 12. - num_heads (int): number of attention heads. Default: 12. - mlp_ratio (int): ratio of mlp hidden dim to embedding dim. - Default: 4. - out_indices (list | tuple | int): Output from which stages. - Default: -1. - qkv_bias (bool): enable bias for qkv if True. Default: True. - qk_scale (float): override default qk scale of head_dim ** -0.5 if set. - drop_rate (float): dropout rate. Default: 0. - attn_drop_rate (float): attention dropout rate. Default: 0. - drop_path_rate (float): Rate of DropPath. Default: 0. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN', eps=1e-6, requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='GELU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - final_norm (bool): Whether to add a additional layer to normalize - final feature map. Default: False. 
- interpolate_mode (str): Select the interpolate mode for position - embeding vector resize. Default: bicubic. - with_cls_token (bool): If concatenating class token into image tokens - as transformer input. Default: True. - with_cp (bool): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - img_size=(224, 224), - patch_size=16, - in_channels=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - out_indices=11, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), - act_cfg=dict(type='GELU'), - norm_eval=False, - final_norm=False, - with_cls_token=True, - interpolate_mode='bicubic', - with_cp=False): - super(VisionTransformer, self).__init__() - self.img_size = img_size - self.patch_size = patch_size - self.features = self.embed_dim = embed_dim - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_channels=in_channels, - embed_dim=embed_dim) - - self.with_cls_token = with_cls_token - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) - self.pos_embed = nn.Parameter( - torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) - self.pos_drop = nn.Dropout(p=drop_rate) - - if isinstance(out_indices, int): - self.out_indices = [out_indices] - elif isinstance(out_indices, list) or isinstance(out_indices, tuple): - self.out_indices = out_indices - else: - raise TypeError('out_indices must be type of int, list or tuple') - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - self.blocks = nn.ModuleList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=dpr[i], - attn_drop=attn_drop_rate, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - with_cp=with_cp) for i in range(depth) - ]) - - self.interpolate_mode = interpolate_mode - self.final_norm = final_norm - if final_norm: - _, self.norm = build_norm_layer(norm_cfg, embed_dim) - - self.norm_eval = norm_eval - self.with_cp = with_cp - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = get_root_logger() - checkpoint = _load_checkpoint(pretrained, logger=logger) - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - - if 'pos_embed' in state_dict.keys(): - if self.pos_embed.shape != state_dict['pos_embed'].shape: - logger.info(msg=f'Resize the pos_embed shape from \ -{state_dict["pos_embed"].shape} to {self.pos_embed.shape}') - h, w = self.img_size - pos_size = int( - math.sqrt(state_dict['pos_embed'].shape[1] - 1)) - state_dict['pos_embed'] = self.resize_pos_embed( - state_dict['pos_embed'], (h, w), (pos_size, pos_size), - self.patch_size, self.interpolate_mode) - - self.load_state_dict(state_dict, False) - - elif pretrained is None: - # We only implement the 'jax_impl' initialization implemented at - # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - for n, m in self.named_modules(): - if isinstance(m, Linear): - trunc_normal_(m.weight, std=.02) - if m.bias is not None: - if 'mlp' in n: - normal_init(m.bias, std=1e-6) - else: - constant_init(m.bias, 0) - elif isinstance(m, Conv2d): - kaiming_init(m.weight, mode='fan_in') - if m.bias is not None: - 
constant_init(m.bias, 0) - elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): - constant_init(m.bias, 0) - constant_init(m.weight, 1.0) - else: - raise TypeError('pretrained must be a str or None') - - def _pos_embeding(self, img, patched_img, pos_embed): - """Positiong embeding method. - - Resize the pos_embed, if the input image size doesn't match - the training size. - Args: - img (torch.Tensor): The inference image tensor, the shape - must be [B, C, H, W]. - patched_img (torch.Tensor): The patched image, it should be - shape of [B, L1, C]. - pos_embed (torch.Tensor): The pos_embed weighs, it should be - shape of [B, L2, c]. - Return: - torch.Tensor: The pos encoded image feature. - """ - assert patched_img.ndim == 3 and pos_embed.ndim == 3, \ - 'the shapes of patched_img and pos_embed must be [B, L, C]' - x_len, pos_len = patched_img.shape[1], pos_embed.shape[1] - if x_len != pos_len: - if pos_len == (self.img_size[0] // self.patch_size) * ( - self.img_size[1] // self.patch_size) + 1: - pos_h = self.img_size[0] // self.patch_size - pos_w = self.img_size[1] // self.patch_size - else: - raise ValueError( - 'Unexpected shape of pos_embed, got {}.'.format( - pos_embed.shape)) - pos_embed = self.resize_pos_embed(pos_embed, img.shape[2:], - (pos_h, pos_w), self.patch_size, - self.interpolate_mode) - return self.pos_drop(patched_img + pos_embed) - - @staticmethod - def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode): - """Resize pos_embed weights. - - Resize pos_embed using bicubic interpolate method. - Args: - pos_embed (torch.Tensor): pos_embed weights. - input_shpae (tuple): Tuple for (input_h, intput_w). - pos_shape (tuple): Tuple for (pos_h, pos_w). - patch_size (int): Patch size. - Return: - torch.Tensor: The resized pos_embed of shape [B, L_new, C] - """ - assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' - input_h, input_w = input_shpae - pos_h, pos_w = pos_shape - cls_token_weight = pos_embed[:, 0] - pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] - pos_embed_weight = pos_embed_weight.reshape( - 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) - pos_embed_weight = F.interpolate( - pos_embed_weight, - size=[input_h // patch_size, input_w // patch_size], - align_corners=False, - mode=mode) - cls_token_weight = cls_token_weight.unsqueeze(1) - pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) - pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) - return pos_embed - - def forward(self, inputs): - B = inputs.shape[0] - - x = self.patch_embed(inputs) - - cls_tokens = self.cls_token.expand(B, -1, -1) - x = torch.cat((cls_tokens, x), dim=1) - x = self._pos_embeding(inputs, x, self.pos_embed) - - if not self.with_cls_token: - # Remove class token for transformer input - x = x[:, 1:] - - outs = [] - for i, blk in enumerate(self.blocks): - x = blk(x) - if i == len(self.blocks) - 1: - if self.final_norm: - x = self.norm(x) - if i in self.out_indices: - if self.with_cls_token: - # Remove class token and reshape token for decoder head - out = x[:, 1:] - else: - out = x - B, _, C = out.shape - out = out.reshape(B, inputs.shape[2] // self.patch_size, - inputs.shape[3] // self.patch_size, - C).permute(0, 3, 1, 2) - outs.append(out) - - return tuple(outs) - - def train(self, mode=True): - super(VisionTransformer, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, nn.LayerNorm): - m.eval() diff --git a/ControlNet/annotator/uniformer/mmseg/models/builder.py 
b/ControlNet/annotator/uniformer/mmseg/models/builder.py deleted file mode 100644 index 1f5b971252bfc971c3ffbaa27746d69b1d3ea9fd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/builder.py +++ /dev/null @@ -1,46 +0,0 @@ -import warnings - -from annotator.uniformer.mmcv.cnn import MODELS as MMCV_MODELS -from annotator.uniformer.mmcv.utils import Registry - -MODELS = Registry('models', parent=MMCV_MODELS) - -BACKBONES = MODELS -NECKS = MODELS -HEADS = MODELS -LOSSES = MODELS -SEGMENTORS = MODELS - - -def build_backbone(cfg): - """Build backbone.""" - return BACKBONES.build(cfg) - - -def build_neck(cfg): - """Build neck.""" - return NECKS.build(cfg) - - -def build_head(cfg): - """Build head.""" - return HEADS.build(cfg) - - -def build_loss(cfg): - """Build loss.""" - return LOSSES.build(cfg) - - -def build_segmentor(cfg, train_cfg=None, test_cfg=None): - """Build segmentor.""" - if train_cfg is not None or test_cfg is not None: - warnings.warn( - 'train_cfg and test_cfg is deprecated, ' - 'please specify them in model', UserWarning) - assert cfg.get('train_cfg') is None or train_cfg is None, \ - 'train_cfg specified in both outer field and model field ' - assert cfg.get('test_cfg') is None or test_cfg is None, \ - 'test_cfg specified in both outer field and model field ' - return SEGMENTORS.build( - cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/__init__.py deleted file mode 100644 index ac66d3cfe0ea04af45c0f3594bf135841c3812e3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from .ann_head import ANNHead -from .apc_head import APCHead -from .aspp_head import ASPPHead -from .cc_head import CCHead -from .da_head import DAHead -from .dm_head import DMHead -from .dnl_head import DNLHead -from .ema_head import EMAHead -from .enc_head import EncHead -from .fcn_head import FCNHead -from .fpn_head import FPNHead -from .gc_head import GCHead -from .lraspp_head import LRASPPHead -from .nl_head import NLHead -from .ocr_head import OCRHead -# from .point_head import PointHead -from .psa_head import PSAHead -from .psp_head import PSPHead -from .sep_aspp_head import DepthwiseSeparableASPPHead -from .sep_fcn_head import DepthwiseSeparableFCNHead -from .uper_head import UPerHead - -__all__ = [ - 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', - 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', - 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', - 'APCHead', 'DMHead', 'LRASPPHead' -] diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ann_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ann_head.py deleted file mode 100644 index 30aaacc2cafc568d3de71d1477b4de0dc0fea9d3..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ann_head.py +++ /dev/null @@ -1,245 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from ..builder import HEADS -from ..utils import SelfAttentionBlock as _SelfAttentionBlock -from .decode_head import BaseDecodeHead - - -class PPMConcat(nn.ModuleList): - """Pyramid Pooling Module that only concat the features of each layer. 
- - Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module. - """ - - def __init__(self, pool_scales=(1, 3, 6, 8)): - super(PPMConcat, self).__init__( - [nn.AdaptiveAvgPool2d(pool_scale) for pool_scale in pool_scales]) - - def forward(self, feats): - """Forward function.""" - ppm_outs = [] - for ppm in self: - ppm_out = ppm(feats) - ppm_outs.append(ppm_out.view(*feats.shape[:2], -1)) - concat_outs = torch.cat(ppm_outs, dim=2) - return concat_outs - - -class SelfAttentionBlock(_SelfAttentionBlock): - """Make a ANN used SelfAttentionBlock. - - Args: - low_in_channels (int): Input channels of lower level feature, - which is the key feature for self-attention. - high_in_channels (int): Input channels of higher level feature, - which is the query feature for self-attention. - channels (int): Output channels of key/query transform. - out_channels (int): Output channels. - share_key_query (bool): Whether share projection weight between key - and query projection. - query_scale (int): The scale of query feature map. - key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module of key feature. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict|None): Config of activation layers. - """ - - def __init__(self, low_in_channels, high_in_channels, channels, - out_channels, share_key_query, query_scale, key_pool_scales, - conv_cfg, norm_cfg, act_cfg): - key_psp = PPMConcat(key_pool_scales) - if query_scale > 1: - query_downsample = nn.MaxPool2d(kernel_size=query_scale) - else: - query_downsample = None - super(SelfAttentionBlock, self).__init__( - key_in_channels=low_in_channels, - query_in_channels=high_in_channels, - channels=channels, - out_channels=out_channels, - share_key_query=share_key_query, - query_downsample=query_downsample, - key_downsample=key_psp, - key_query_num_convs=1, - key_query_norm=True, - value_out_num_convs=1, - value_out_norm=False, - matmul_norm=True, - with_out=True, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - -class AFNB(nn.Module): - """Asymmetric Fusion Non-local Block(AFNB) - - Args: - low_in_channels (int): Input channels of lower level feature, - which is the key feature for self-attention. - high_in_channels (int): Input channels of higher level feature, - which is the query feature for self-attention. - channels (int): Output channels of key/query transform. - out_channels (int): Output channels. - and query projection. - query_scales (tuple[int]): The scales of query feature map. - Default: (1,) - key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module of key feature. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict|None): Config of activation layers. 
- """ - - def __init__(self, low_in_channels, high_in_channels, channels, - out_channels, query_scales, key_pool_scales, conv_cfg, - norm_cfg, act_cfg): - super(AFNB, self).__init__() - self.stages = nn.ModuleList() - for query_scale in query_scales: - self.stages.append( - SelfAttentionBlock( - low_in_channels=low_in_channels, - high_in_channels=high_in_channels, - channels=channels, - out_channels=out_channels, - share_key_query=False, - query_scale=query_scale, - key_pool_scales=key_pool_scales, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - self.bottleneck = ConvModule( - out_channels + high_in_channels, - out_channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - def forward(self, low_feats, high_feats): - """Forward function.""" - priors = [stage(high_feats, low_feats) for stage in self.stages] - context = torch.stack(priors, dim=0).sum(dim=0) - output = self.bottleneck(torch.cat([context, high_feats], 1)) - return output - - -class APNB(nn.Module): - """Asymmetric Pyramid Non-local Block (APNB) - - Args: - in_channels (int): Input channels of key/query feature, - which is the key feature for self-attention. - channels (int): Output channels of key/query transform. - out_channels (int): Output channels. - query_scales (tuple[int]): The scales of query feature map. - Default: (1,) - key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module of key feature. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict|None): Config of activation layers. - """ - - def __init__(self, in_channels, channels, out_channels, query_scales, - key_pool_scales, conv_cfg, norm_cfg, act_cfg): - super(APNB, self).__init__() - self.stages = nn.ModuleList() - for query_scale in query_scales: - self.stages.append( - SelfAttentionBlock( - low_in_channels=in_channels, - high_in_channels=in_channels, - channels=channels, - out_channels=out_channels, - share_key_query=True, - query_scale=query_scale, - key_pool_scales=key_pool_scales, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - self.bottleneck = ConvModule( - 2 * in_channels, - out_channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - def forward(self, feats): - """Forward function.""" - priors = [stage(feats, feats) for stage in self.stages] - context = torch.stack(priors, dim=0).sum(dim=0) - output = self.bottleneck(torch.cat([context, feats], 1)) - return output - - -@HEADS.register_module() -class ANNHead(BaseDecodeHead): - """Asymmetric Non-local Neural Networks for Semantic Segmentation. - - This head is the implementation of `ANNNet - `_. - - Args: - project_channels (int): Projection channels for Nonlocal. - query_scales (tuple[int]): The scales of query feature map. - Default: (1,) - key_pool_scales (tuple[int]): The pooling scales of key feature map. - Default: (1, 3, 6, 8). 
- """ - - def __init__(self, - project_channels, - query_scales=(1, ), - key_pool_scales=(1, 3, 6, 8), - **kwargs): - super(ANNHead, self).__init__( - input_transform='multiple_select', **kwargs) - assert len(self.in_channels) == 2 - low_in_channels, high_in_channels = self.in_channels - self.project_channels = project_channels - self.fusion = AFNB( - low_in_channels=low_in_channels, - high_in_channels=high_in_channels, - out_channels=high_in_channels, - channels=project_channels, - query_scales=query_scales, - key_pool_scales=key_pool_scales, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.bottleneck = ConvModule( - high_in_channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.context = APNB( - in_channels=self.channels, - out_channels=self.channels, - channels=project_channels, - query_scales=query_scales, - key_pool_scales=key_pool_scales, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - low_feats, high_feats = self._transform_inputs(inputs) - output = self.fusion(low_feats, high_feats) - output = self.dropout(output) - output = self.bottleneck(output) - output = self.context(output) - output = self.cls_seg(output) - - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/apc_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/apc_head.py deleted file mode 100644 index c7038bdbe0edf2a1f184b6899486d2d190dda076..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/apc_head.py +++ /dev/null @@ -1,158 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class ACM(nn.Module): - """Adaptive Context Module used in APCNet. - - Args: - pool_scale (int): Pooling scale used in Adaptive Context - Module to extract region features. - fusion (bool): Add one conv to fuse residual feature. - in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict | None): Config of conv layers. - norm_cfg (dict | None): Config of norm layers. - act_cfg (dict): Config of activation layers. 
- """ - - def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg, - norm_cfg, act_cfg): - super(ACM, self).__init__() - self.pool_scale = pool_scale - self.fusion = fusion - self.in_channels = in_channels - self.channels = channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.pooled_redu_conv = ConvModule( - self.in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.input_redu_conv = ConvModule( - self.in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.global_info = ConvModule( - self.channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.gla = nn.Conv2d(self.channels, self.pool_scale**2, 1, 1, 0) - - self.residual_conv = ConvModule( - self.channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - if self.fusion: - self.fusion_conv = ConvModule( - self.channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, x): - """Forward function.""" - pooled_x = F.adaptive_avg_pool2d(x, self.pool_scale) - # [batch_size, channels, h, w] - x = self.input_redu_conv(x) - # [batch_size, channels, pool_scale, pool_scale] - pooled_x = self.pooled_redu_conv(pooled_x) - batch_size = x.size(0) - # [batch_size, pool_scale * pool_scale, channels] - pooled_x = pooled_x.view(batch_size, self.channels, - -1).permute(0, 2, 1).contiguous() - # [batch_size, h * w, pool_scale * pool_scale] - affinity_matrix = self.gla(x + resize( - self.global_info(F.adaptive_avg_pool2d(x, 1)), size=x.shape[2:]) - ).permute(0, 2, 3, 1).reshape( - batch_size, -1, self.pool_scale**2) - affinity_matrix = F.sigmoid(affinity_matrix) - # [batch_size, h * w, channels] - z_out = torch.matmul(affinity_matrix, pooled_x) - # [batch_size, channels, h * w] - z_out = z_out.permute(0, 2, 1).contiguous() - # [batch_size, channels, h, w] - z_out = z_out.view(batch_size, self.channels, x.size(2), x.size(3)) - z_out = self.residual_conv(z_out) - z_out = F.relu(z_out + x) - if self.fusion: - z_out = self.fusion_conv(z_out) - - return z_out - - -@HEADS.register_module() -class APCHead(BaseDecodeHead): - """Adaptive Pyramid Context Network for Semantic Segmentation. - - This head is the implementation of - `APCNet `_. - - Args: - pool_scales (tuple[int]): Pooling scales used in Adaptive Context - Module. Default: (1, 2, 3, 6). - fusion (bool): Add one conv to fuse residual feature. 
- """ - - def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs): - super(APCHead, self).__init__(**kwargs) - assert isinstance(pool_scales, (list, tuple)) - self.pool_scales = pool_scales - self.fusion = fusion - acm_modules = [] - for pool_scale in self.pool_scales: - acm_modules.append( - ACM(pool_scale, - self.fusion, - self.in_channels, - self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - self.acm_modules = nn.ModuleList(acm_modules) - self.bottleneck = ConvModule( - self.in_channels + len(pool_scales) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - acm_outs = [x] - for acm_module in self.acm_modules: - acm_outs.append(acm_module(x)) - acm_outs = torch.cat(acm_outs, dim=1) - output = self.bottleneck(acm_outs) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/aspp_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/aspp_head.py deleted file mode 100644 index aa914b5bb25124d1ff199553d96713d6a80484c0..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/aspp_head.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class ASPPModule(nn.ModuleList): - """Atrous Spatial Pyramid Pooling (ASPP) Module. - - Args: - dilations (tuple[int]): Dilation rate of each layer. - in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict): Config of activation layers. - """ - - def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg, - act_cfg): - super(ASPPModule, self).__init__() - self.dilations = dilations - self.in_channels = in_channels - self.channels = channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - for dilation in dilations: - self.append( - ConvModule( - self.in_channels, - self.channels, - 1 if dilation == 1 else 3, - dilation=dilation, - padding=0 if dilation == 1 else dilation, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - - def forward(self, x): - """Forward function.""" - aspp_outs = [] - for aspp_module in self: - aspp_outs.append(aspp_module(x)) - - return aspp_outs - - -@HEADS.register_module() -class ASPPHead(BaseDecodeHead): - """Rethinking Atrous Convolution for Semantic Image Segmentation. - - This head is the implementation of `DeepLabV3 - `_. - - Args: - dilations (tuple[int]): Dilation rates for ASPP module. - Default: (1, 6, 12, 18). 
- """ - - def __init__(self, dilations=(1, 6, 12, 18), **kwargs): - super(ASPPHead, self).__init__(**kwargs) - assert isinstance(dilations, (list, tuple)) - self.dilations = dilations - self.image_pool = nn.Sequential( - nn.AdaptiveAvgPool2d(1), - ConvModule( - self.in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - self.aspp_modules = ASPPModule( - dilations, - self.in_channels, - self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.bottleneck = ConvModule( - (len(dilations) + 1) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - aspp_outs = [ - resize( - self.image_pool(x), - size=x.size()[2:], - mode='bilinear', - align_corners=self.align_corners) - ] - aspp_outs.extend(self.aspp_modules(x)) - aspp_outs = torch.cat(aspp_outs, dim=1) - output = self.bottleneck(aspp_outs) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cascade_decode_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cascade_decode_head.py deleted file mode 100644 index d02122ca0e68743b1bf7a893afae96042f23838c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cascade_decode_head.py +++ /dev/null @@ -1,57 +0,0 @@ -from abc import ABCMeta, abstractmethod - -from .decode_head import BaseDecodeHead - - -class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta): - """Base class for cascade decode head used in - :class:`CascadeEncoderDecoder.""" - - def __init__(self, *args, **kwargs): - super(BaseCascadeDecodeHead, self).__init__(*args, **kwargs) - - @abstractmethod - def forward(self, inputs, prev_output): - """Placeholder of forward function.""" - pass - - def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, - train_cfg): - """Forward function for training. - Args: - inputs (list[Tensor]): List of multi-level img features. - prev_output (Tensor): The output of previous decode head. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - gt_semantic_seg (Tensor): Semantic segmentation masks - used if the architecture supports semantic segmentation task. - train_cfg (dict): The training config. - - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - seg_logits = self.forward(inputs, prev_output) - losses = self.losses(seg_logits, gt_semantic_seg) - - return losses - - def forward_test(self, inputs, prev_output, img_metas, test_cfg): - """Forward function for testing. - - Args: - inputs (list[Tensor]): List of multi-level img features. - prev_output (Tensor): The output of previous decode head. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - test_cfg (dict): The testing config. - - Returns: - Tensor: Output segmentation map. 
- """ - return self.forward(inputs, prev_output) diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cc_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cc_head.py deleted file mode 100644 index 5b9abb4e747f92657f4220b29788539340986c00..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/cc_head.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch - -from ..builder import HEADS -from .fcn_head import FCNHead - -try: - from annotator.uniformer.mmcv.ops import CrissCrossAttention -except ModuleNotFoundError: - CrissCrossAttention = None - - -@HEADS.register_module() -class CCHead(FCNHead): - """CCNet: Criss-Cross Attention for Semantic Segmentation. - - This head is the implementation of `CCNet - `_. - - Args: - recurrence (int): Number of recurrence of Criss Cross Attention - module. Default: 2. - """ - - def __init__(self, recurrence=2, **kwargs): - if CrissCrossAttention is None: - raise RuntimeError('Please install mmcv-full for ' - 'CrissCrossAttention ops') - super(CCHead, self).__init__(num_convs=2, **kwargs) - self.recurrence = recurrence - self.cca = CrissCrossAttention(self.channels) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - output = self.convs[0](x) - for _ in range(self.recurrence): - output = self.cca(output) - output = self.convs[1](output) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/da_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/da_head.py deleted file mode 100644 index 5cd49fcfdc7c0a70f9485cc71843dcf3e0cb1774..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/da_head.py +++ /dev/null @@ -1,178 +0,0 @@ -import torch -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule, Scale -from torch import nn - -from annotator.uniformer.mmseg.core import add_prefix -from ..builder import HEADS -from ..utils import SelfAttentionBlock as _SelfAttentionBlock -from .decode_head import BaseDecodeHead - - -class PAM(_SelfAttentionBlock): - """Position Attention Module (PAM) - - Args: - in_channels (int): Input channels of key/query feature. - channels (int): Output channels of key/query transform. 
- """ - - def __init__(self, in_channels, channels): - super(PAM, self).__init__( - key_in_channels=in_channels, - query_in_channels=in_channels, - channels=channels, - out_channels=in_channels, - share_key_query=False, - query_downsample=None, - key_downsample=None, - key_query_num_convs=1, - key_query_norm=False, - value_out_num_convs=1, - value_out_norm=False, - matmul_norm=False, - with_out=False, - conv_cfg=None, - norm_cfg=None, - act_cfg=None) - - self.gamma = Scale(0) - - def forward(self, x): - """Forward function.""" - out = super(PAM, self).forward(x, x) - - out = self.gamma(out) + x - return out - - -class CAM(nn.Module): - """Channel Attention Module (CAM)""" - - def __init__(self): - super(CAM, self).__init__() - self.gamma = Scale(0) - - def forward(self, x): - """Forward function.""" - batch_size, channels, height, width = x.size() - proj_query = x.view(batch_size, channels, -1) - proj_key = x.view(batch_size, channels, -1).permute(0, 2, 1) - energy = torch.bmm(proj_query, proj_key) - energy_new = torch.max( - energy, -1, keepdim=True)[0].expand_as(energy) - energy - attention = F.softmax(energy_new, dim=-1) - proj_value = x.view(batch_size, channels, -1) - - out = torch.bmm(attention, proj_value) - out = out.view(batch_size, channels, height, width) - - out = self.gamma(out) + x - return out - - -@HEADS.register_module() -class DAHead(BaseDecodeHead): - """Dual Attention Network for Scene Segmentation. - - This head is the implementation of `DANet - `_. - - Args: - pam_channels (int): The channels of Position Attention Module(PAM). - """ - - def __init__(self, pam_channels, **kwargs): - super(DAHead, self).__init__(**kwargs) - self.pam_channels = pam_channels - self.pam_in_conv = ConvModule( - self.in_channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.pam = PAM(self.channels, pam_channels) - self.pam_out_conv = ConvModule( - self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.pam_conv_seg = nn.Conv2d( - self.channels, self.num_classes, kernel_size=1) - - self.cam_in_conv = ConvModule( - self.in_channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.cam = CAM() - self.cam_out_conv = ConvModule( - self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.cam_conv_seg = nn.Conv2d( - self.channels, self.num_classes, kernel_size=1) - - def pam_cls_seg(self, feat): - """PAM feature classification.""" - if self.dropout is not None: - feat = self.dropout(feat) - output = self.pam_conv_seg(feat) - return output - - def cam_cls_seg(self, feat): - """CAM feature classification.""" - if self.dropout is not None: - feat = self.dropout(feat) - output = self.cam_conv_seg(feat) - return output - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - pam_feat = self.pam_in_conv(x) - pam_feat = self.pam(pam_feat) - pam_feat = self.pam_out_conv(pam_feat) - pam_out = self.pam_cls_seg(pam_feat) - - cam_feat = self.cam_in_conv(x) - cam_feat = self.cam(cam_feat) - cam_feat = self.cam_out_conv(cam_feat) - cam_out = self.cam_cls_seg(cam_feat) - - feat_sum = pam_feat + cam_feat - pam_cam_out = self.cls_seg(feat_sum) - - return pam_cam_out, pam_out, cam_out - - def forward_test(self, inputs, img_metas, test_cfg): - """Forward function for testing, only 
``pam_cam`` is used.""" - return self.forward(inputs)[0] - - def losses(self, seg_logit, seg_label): - """Compute ``pam_cam``, ``pam``, ``cam`` loss.""" - pam_cam_seg_logit, pam_seg_logit, cam_seg_logit = seg_logit - loss = dict() - loss.update( - add_prefix( - super(DAHead, self).losses(pam_cam_seg_logit, seg_label), - 'pam_cam')) - loss.update( - add_prefix( - super(DAHead, self).losses(pam_seg_logit, seg_label), 'pam')) - loss.update( - add_prefix( - super(DAHead, self).losses(cam_seg_logit, seg_label), 'cam')) - return loss diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/decode_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/decode_head.py deleted file mode 100644 index 88a661b8f6fec5d4c031d3d85e80777ee63951a6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/decode_head.py +++ /dev/null @@ -1,234 +0,0 @@ -from abc import ABCMeta, abstractmethod - -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import normal_init -from annotator.uniformer.mmcv.runner import auto_fp16, force_fp32 - -from annotator.uniformer.mmseg.core import build_pixel_sampler -from annotator.uniformer.mmseg.ops import resize -from ..builder import build_loss -from ..losses import accuracy - - -class BaseDecodeHead(nn.Module, metaclass=ABCMeta): - """Base class for BaseDecodeHead. - - Args: - in_channels (int|Sequence[int]): Input channels. - channels (int): Channels after modules, before conv_seg. - num_classes (int): Number of classes. - dropout_ratio (float): Ratio of dropout layer. Default: 0.1. - conv_cfg (dict|None): Config of conv layers. Default: None. - norm_cfg (dict|None): Config of norm layers. Default: None. - act_cfg (dict): Config of activation layers. - Default: dict(type='ReLU') - in_index (int|Sequence[int]): Input feature index. Default: -1 - input_transform (str|None): Transformation type of input features. - Options: 'resize_concat', 'multiple_select', None. - 'resize_concat': Multiple feature maps will be resize to the - same size as first one and than concat together. - Usually used in FCN head of HRNet. - 'multiple_select': Multiple feature maps will be bundle into - a list and passed into decode head. - None: Only one select feature map is allowed. - Default: None. - loss_decode (dict): Config of decode loss. - Default: dict(type='CrossEntropyLoss'). - ignore_index (int | None): The label index to be ignored. When using - masked BCE loss, ignore_index should be set to None. Default: 255 - sampler (dict|None): The config of segmentation map sampler. - Default: None. - align_corners (bool): align_corners argument of F.interpolate. - Default: False. 
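A minimal sketch, assuming BaseDecodeHead and the helpers defined later in this file (_transform_inputs, cls_seg) are importable; the subclass name, channel sizes, and dummy tensors are hypothetical and not part of the deleted code.

import torch

class IdentityHead(BaseDecodeHead):
    """Toy head: select one feature map (input_transform=None) and classify it."""

    def forward(self, inputs):
        x = self._transform_inputs(inputs)  # picks inputs[self.in_index]
        return self.cls_seg(x)              # dropout + 1x1 conv_seg

head = IdentityHead(in_channels=256, channels=256, num_classes=19, in_index=-1)
feats = [torch.randn(2, 64, 128, 128), torch.randn(2, 256, 32, 32)]
seg_logits = head(feats)                    # shape (2, 19, 32, 32)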
- """ - - def __init__(self, - in_channels, - channels, - *, - num_classes, - dropout_ratio=0.1, - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - in_index=-1, - input_transform=None, - loss_decode=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0), - ignore_index=255, - sampler=None, - align_corners=False): - super(BaseDecodeHead, self).__init__() - self._init_inputs(in_channels, in_index, input_transform) - self.channels = channels - self.num_classes = num_classes - self.dropout_ratio = dropout_ratio - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.in_index = in_index - self.loss_decode = build_loss(loss_decode) - self.ignore_index = ignore_index - self.align_corners = align_corners - if sampler is not None: - self.sampler = build_pixel_sampler(sampler, context=self) - else: - self.sampler = None - - self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) - if dropout_ratio > 0: - self.dropout = nn.Dropout2d(dropout_ratio) - else: - self.dropout = None - self.fp16_enabled = False - - def extra_repr(self): - """Extra repr.""" - s = f'input_transform={self.input_transform}, ' \ - f'ignore_index={self.ignore_index}, ' \ - f'align_corners={self.align_corners}' - return s - - def _init_inputs(self, in_channels, in_index, input_transform): - """Check and initialize input transforms. - - The in_channels, in_index and input_transform must match. - Specifically, when input_transform is None, only single feature map - will be selected. So in_channels and in_index must be of type int. - When input_transform - - Args: - in_channels (int|Sequence[int]): Input channels. - in_index (int|Sequence[int]): Input feature index. - input_transform (str|None): Transformation type of input features. - Options: 'resize_concat', 'multiple_select', None. - 'resize_concat': Multiple feature maps will be resize to the - same size as first one and than concat together. - Usually used in FCN head of HRNet. - 'multiple_select': Multiple feature maps will be bundle into - a list and passed into decode head. - None: Only one select feature map is allowed. - """ - - if input_transform is not None: - assert input_transform in ['resize_concat', 'multiple_select'] - self.input_transform = input_transform - self.in_index = in_index - if input_transform is not None: - assert isinstance(in_channels, (list, tuple)) - assert isinstance(in_index, (list, tuple)) - assert len(in_channels) == len(in_index) - if input_transform == 'resize_concat': - self.in_channels = sum(in_channels) - else: - self.in_channels = in_channels - else: - assert isinstance(in_channels, int) - assert isinstance(in_index, int) - self.in_channels = in_channels - - def init_weights(self): - """Initialize weights of classification layer.""" - normal_init(self.conv_seg, mean=0, std=0.01) - - def _transform_inputs(self, inputs): - """Transform inputs for decoder. - - Args: - inputs (list[Tensor]): List of multi-level img features. 
- - Returns: - Tensor: The transformed inputs - """ - - if self.input_transform == 'resize_concat': - inputs = [inputs[i] for i in self.in_index] - upsampled_inputs = [ - resize( - input=x, - size=inputs[0].shape[2:], - mode='bilinear', - align_corners=self.align_corners) for x in inputs - ] - inputs = torch.cat(upsampled_inputs, dim=1) - elif self.input_transform == 'multiple_select': - inputs = [inputs[i] for i in self.in_index] - else: - inputs = inputs[self.in_index] - - return inputs - - @auto_fp16() - @abstractmethod - def forward(self, inputs): - """Placeholder of forward function.""" - pass - - def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): - """Forward function for training. - Args: - inputs (list[Tensor]): List of multi-level img features. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - gt_semantic_seg (Tensor): Semantic segmentation masks - used if the architecture supports semantic segmentation task. - train_cfg (dict): The training config. - - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - seg_logits = self.forward(inputs) - losses = self.losses(seg_logits, gt_semantic_seg) - return losses - - def forward_test(self, inputs, img_metas, test_cfg): - """Forward function for testing. - - Args: - inputs (list[Tensor]): List of multi-level img features. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - test_cfg (dict): The testing config. - - Returns: - Tensor: Output segmentation map. - """ - return self.forward(inputs) - - def cls_seg(self, feat): - """Classify each pixel.""" - if self.dropout is not None: - feat = self.dropout(feat) - output = self.conv_seg(feat) - return output - - @force_fp32(apply_to=('seg_logit', )) - def losses(self, seg_logit, seg_label): - """Compute segmentation loss.""" - loss = dict() - seg_logit = resize( - input=seg_logit, - size=seg_label.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - if self.sampler is not None: - seg_weight = self.sampler.sample(seg_logit, seg_label) - else: - seg_weight = None - seg_label = seg_label.squeeze(1) - loss['loss_seg'] = self.loss_decode( - seg_logit, - seg_label, - weight=seg_weight, - ignore_index=self.ignore_index) - loss['acc_seg'] = accuracy(seg_logit, seg_label) - return loss diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dm_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dm_head.py deleted file mode 100644 index 19c963923126b53ce22f60813540a35badf24b3d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dm_head.py +++ /dev/null @@ -1,140 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer - -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class DCM(nn.Module): - """Dynamic Convolutional Module used in DMNet. - - Args: - filter_size (int): The filter size of generated convolution kernel - used in Dynamic Convolutional Module. 
- fusion (bool): Add one conv to fuse DCM output feature. - in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict | None): Config of conv layers. - norm_cfg (dict | None): Config of norm layers. - act_cfg (dict): Config of activation layers. - """ - - def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg, - norm_cfg, act_cfg): - super(DCM, self).__init__() - self.filter_size = filter_size - self.fusion = fusion - self.in_channels = in_channels - self.channels = channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.filter_gen_conv = nn.Conv2d(self.in_channels, self.channels, 1, 1, - 0) - - self.input_redu_conv = ConvModule( - self.in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - if self.norm_cfg is not None: - self.norm = build_norm_layer(self.norm_cfg, self.channels)[1] - else: - self.norm = None - self.activate = build_activation_layer(self.act_cfg) - - if self.fusion: - self.fusion_conv = ConvModule( - self.channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, x): - """Forward function.""" - generated_filter = self.filter_gen_conv( - F.adaptive_avg_pool2d(x, self.filter_size)) - x = self.input_redu_conv(x) - b, c, h, w = x.shape - # [1, b * c, h, w], c = self.channels - x = x.view(1, b * c, h, w) - # [b * c, 1, filter_size, filter_size] - generated_filter = generated_filter.view(b * c, 1, self.filter_size, - self.filter_size) - pad = (self.filter_size - 1) // 2 - if (self.filter_size - 1) % 2 == 0: - p2d = (pad, pad, pad, pad) - else: - p2d = (pad + 1, pad, pad + 1, pad) - x = F.pad(input=x, pad=p2d, mode='constant', value=0) - # [1, b * c, h, w] - output = F.conv2d(input=x, weight=generated_filter, groups=b * c) - # [b, c, h, w] - output = output.view(b, c, h, w) - if self.norm is not None: - output = self.norm(output) - output = self.activate(output) - - if self.fusion: - output = self.fusion_conv(output) - - return output - - -@HEADS.register_module() -class DMHead(BaseDecodeHead): - """Dynamic Multi-scale Filters for Semantic Segmentation. - - This head is the implementation of - `DMNet `_. - - Args: - filter_sizes (tuple[int]): The size of generated convolutional filters - used in Dynamic Convolutional Module. Default: (1, 3, 5, 7). - fusion (bool): Add one conv to fuse DCM output feature. 
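A standalone sketch (plain torch, not part of the deleted file) of the per-sample dynamic filtering trick in DCM.forward above; the shapes are hypothetical, and an odd filter size is assumed so symmetric padding suffices.

import torch
import torch.nn.functional as F

b, c, h, w, k = 2, 4, 8, 8, 3
x = torch.randn(b, c, h, w)
filters = torch.randn(b, c, k, k)            # one k x k filter per (sample, channel)

x_flat = x.view(1, b * c, h, w)              # fold the batch into the channel dim
w_flat = filters.view(b * c, 1, k, k)        # groups=b*c makes it depthwise per sample
out = F.conv2d(x_flat, w_flat, padding=k // 2, groups=b * c)
out = out.view(b, c, h, w)                   # back to per-sample layout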
- """ - - def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs): - super(DMHead, self).__init__(**kwargs) - assert isinstance(filter_sizes, (list, tuple)) - self.filter_sizes = filter_sizes - self.fusion = fusion - dcm_modules = [] - for filter_size in self.filter_sizes: - dcm_modules.append( - DCM(filter_size, - self.fusion, - self.in_channels, - self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - self.dcm_modules = nn.ModuleList(dcm_modules) - self.bottleneck = ConvModule( - self.in_channels + len(filter_sizes) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - dcm_outs = [x] - for dcm_module in self.dcm_modules: - dcm_outs.append(dcm_module(x)) - dcm_outs = torch.cat(dcm_outs, dim=1) - output = self.bottleneck(dcm_outs) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dnl_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dnl_head.py deleted file mode 100644 index 333280c5947066fd3c7ebcfe302a0e7ad65480d5..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/dnl_head.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch -from annotator.uniformer.mmcv.cnn import NonLocal2d -from torch import nn - -from ..builder import HEADS -from .fcn_head import FCNHead - - -class DisentangledNonLocal2d(NonLocal2d): - """Disentangled Non-Local Blocks. - - Args: - temperature (float): Temperature to adjust attention. Default: 0.05 - """ - - def __init__(self, *arg, temperature, **kwargs): - super().__init__(*arg, **kwargs) - self.temperature = temperature - self.conv_mask = nn.Conv2d(self.in_channels, 1, kernel_size=1) - - def embedded_gaussian(self, theta_x, phi_x): - """Embedded gaussian with temperature.""" - - # NonLocal2d pairwise_weight: [N, HxW, HxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - if self.use_scale: - # theta_x.shape[-1] is `self.inter_channels` - pairwise_weight /= theta_x.shape[-1]**0.5 - pairwise_weight /= self.temperature - pairwise_weight = pairwise_weight.softmax(dim=-1) - return pairwise_weight - - def forward(self, x): - # x: [N, C, H, W] - n = x.size(0) - - # g_x: [N, HxW, C] - g_x = self.g(x).view(n, self.inter_channels, -1) - g_x = g_x.permute(0, 2, 1) - - # theta_x: [N, HxW, C], phi_x: [N, C, HxW] - if self.mode == 'gaussian': - theta_x = x.view(n, self.in_channels, -1) - theta_x = theta_x.permute(0, 2, 1) - if self.sub_sample: - phi_x = self.phi(x).view(n, self.in_channels, -1) - else: - phi_x = x.view(n, self.in_channels, -1) - elif self.mode == 'concatenation': - theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) - phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) - else: - theta_x = self.theta(x).view(n, self.inter_channels, -1) - theta_x = theta_x.permute(0, 2, 1) - phi_x = self.phi(x).view(n, self.inter_channels, -1) - - # subtract mean - theta_x -= theta_x.mean(dim=-2, keepdim=True) - phi_x -= phi_x.mean(dim=-1, keepdim=True) - - pairwise_func = getattr(self, self.mode) - # pairwise_weight: [N, HxW, HxW] - pairwise_weight = pairwise_func(theta_x, phi_x) - - # y: [N, HxW, C] - y = torch.matmul(pairwise_weight, g_x) - # y: [N, C, H, W] - y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, - *x.size()[2:]) - - # unary_mask: [N, 1, HxW] - unary_mask = self.conv_mask(x) - 
unary_mask = unary_mask.view(n, 1, -1) - unary_mask = unary_mask.softmax(dim=-1) - # unary_x: [N, 1, C] - unary_x = torch.matmul(unary_mask, g_x) - # unary_x: [N, C, 1, 1] - unary_x = unary_x.permute(0, 2, 1).contiguous().reshape( - n, self.inter_channels, 1, 1) - - output = x + self.conv_out(y + unary_x) - - return output - - -@HEADS.register_module() -class DNLHead(FCNHead): - """Disentangled Non-Local Neural Networks. - - This head is the implementation of `DNLNet - `_. - - Args: - reduction (int): Reduction factor of projection transform. Default: 2. - use_scale (bool): Whether to scale pairwise_weight by - sqrt(1/inter_channels). Default: False. - mode (str): The nonlocal mode. Options are 'embedded_gaussian', - 'dot_product'. Default: 'embedded_gaussian.'. - temperature (float): Temperature to adjust attention. Default: 0.05 - """ - - def __init__(self, - reduction=2, - use_scale=True, - mode='embedded_gaussian', - temperature=0.05, - **kwargs): - super(DNLHead, self).__init__(num_convs=2, **kwargs) - self.reduction = reduction - self.use_scale = use_scale - self.mode = mode - self.temperature = temperature - self.dnl_block = DisentangledNonLocal2d( - in_channels=self.channels, - reduction=self.reduction, - use_scale=self.use_scale, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - mode=self.mode, - temperature=self.temperature) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - output = self.convs[0](x) - output = self.dnl_block(output) - output = self.convs[1](output) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ema_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ema_head.py deleted file mode 100644 index 12267cb40569d2b5a4a2955a6dc2671377ff5e0a..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ema_head.py +++ /dev/null @@ -1,168 +0,0 @@ -import math - -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule - -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -def reduce_mean(tensor): - """Reduce mean when distributed training.""" - if not (dist.is_available() and dist.is_initialized()): - return tensor - tensor = tensor.clone() - dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) - return tensor - - -class EMAModule(nn.Module): - """Expectation Maximization Attention Module used in EMANet. - - Args: - channels (int): Channels of the whole module. - num_bases (int): Number of bases. - num_stages (int): Number of the EM iterations. - """ - - def __init__(self, channels, num_bases, num_stages, momentum): - super(EMAModule, self).__init__() - assert num_stages >= 1, 'num_stages must be at least 1!' - self.num_bases = num_bases - self.num_stages = num_stages - self.momentum = momentum - - bases = torch.zeros(1, channels, self.num_bases) - bases.normal_(0, math.sqrt(2. 
/ self.num_bases)) - # [1, channels, num_bases] - bases = F.normalize(bases, dim=1, p=2) - self.register_buffer('bases', bases) - - def forward(self, feats): - """Forward function.""" - batch_size, channels, height, width = feats.size() - # [batch_size, channels, height*width] - feats = feats.view(batch_size, channels, height * width) - # [batch_size, channels, num_bases] - bases = self.bases.repeat(batch_size, 1, 1) - - with torch.no_grad(): - for i in range(self.num_stages): - # [batch_size, height*width, num_bases] - attention = torch.einsum('bcn,bck->bnk', feats, bases) - attention = F.softmax(attention, dim=2) - # l1 norm - attention_normed = F.normalize(attention, dim=1, p=1) - # [batch_size, channels, num_bases] - bases = torch.einsum('bcn,bnk->bck', feats, attention_normed) - # l2 norm - bases = F.normalize(bases, dim=1, p=2) - - feats_recon = torch.einsum('bck,bnk->bcn', bases, attention) - feats_recon = feats_recon.view(batch_size, channels, height, width) - - if self.training: - bases = bases.mean(dim=0, keepdim=True) - bases = reduce_mean(bases) - # l2 norm - bases = F.normalize(bases, dim=1, p=2) - self.bases = (1 - - self.momentum) * self.bases + self.momentum * bases - - return feats_recon - - -@HEADS.register_module() -class EMAHead(BaseDecodeHead): - """Expectation Maximization Attention Networks for Semantic Segmentation. - - This head is the implementation of `EMANet - `_. - - Args: - ema_channels (int): EMA module channels - num_bases (int): Number of bases. - num_stages (int): Number of the EM iterations. - concat_input (bool): Whether concat the input and output of convs - before classification layer. Default: True - momentum (float): Momentum to update the base. Default: 0.1. - """ - - def __init__(self, - ema_channels, - num_bases, - num_stages, - concat_input=True, - momentum=0.1, - **kwargs): - super(EMAHead, self).__init__(**kwargs) - self.ema_channels = ema_channels - self.num_bases = num_bases - self.num_stages = num_stages - self.concat_input = concat_input - self.momentum = momentum - self.ema_module = EMAModule(self.ema_channels, self.num_bases, - self.num_stages, self.momentum) - - self.ema_in_conv = ConvModule( - self.in_channels, - self.ema_channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - # project (0, inf) -> (-inf, inf) - self.ema_mid_conv = ConvModule( - self.ema_channels, - self.ema_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=None, - act_cfg=None) - for param in self.ema_mid_conv.parameters(): - param.requires_grad = False - - self.ema_out_conv = ConvModule( - self.ema_channels, - self.ema_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=None) - self.bottleneck = ConvModule( - self.ema_channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - if self.concat_input: - self.conv_cat = ConvModule( - self.in_channels + self.channels, - self.channels, - kernel_size=3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - feats = self.ema_in_conv(x) - identity = feats - feats = self.ema_mid_conv(feats) - recon = self.ema_module(feats) - recon = F.relu(recon, inplace=True) - recon = self.ema_out_conv(recon) - output = F.relu(identity + recon, inplace=True) - output = self.bottleneck(output) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - 
output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/enc_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/enc_head.py deleted file mode 100644 index da57af617e05d41761628fd2d6d232655b32d905..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/enc_head.py +++ /dev/null @@ -1,187 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule, build_norm_layer - -from annotator.uniformer.mmseg.ops import Encoding, resize -from ..builder import HEADS, build_loss -from .decode_head import BaseDecodeHead - - -class EncModule(nn.Module): - """Encoding Module used in EncNet. - - Args: - in_channels (int): Input channels. - num_codes (int): Number of code words. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict): Config of activation layers. - """ - - def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg): - super(EncModule, self).__init__() - self.encoding_project = ConvModule( - in_channels, - in_channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - # TODO: resolve this hack - # change to 1d - if norm_cfg is not None: - encoding_norm_cfg = norm_cfg.copy() - if encoding_norm_cfg['type'] in ['BN', 'IN']: - encoding_norm_cfg['type'] += '1d' - else: - encoding_norm_cfg['type'] = encoding_norm_cfg['type'].replace( - '2d', '1d') - else: - # fallback to BN1d - encoding_norm_cfg = dict(type='BN1d') - self.encoding = nn.Sequential( - Encoding(channels=in_channels, num_codes=num_codes), - build_norm_layer(encoding_norm_cfg, num_codes)[1], - nn.ReLU(inplace=True)) - self.fc = nn.Sequential( - nn.Linear(in_channels, in_channels), nn.Sigmoid()) - - def forward(self, x): - """Forward function.""" - encoding_projection = self.encoding_project(x) - encoding_feat = self.encoding(encoding_projection).mean(dim=1) - batch_size, channels, _, _ = x.size() - gamma = self.fc(encoding_feat) - y = gamma.view(batch_size, channels, 1, 1) - output = F.relu_(x + x * y) - return encoding_feat, output - - -@HEADS.register_module() -class EncHead(BaseDecodeHead): - """Context Encoding for Semantic Segmentation. - - This head is the implementation of `EncNet - `_. - - Args: - num_codes (int): Number of code words. Default: 32. - use_se_loss (bool): Whether use Semantic Encoding Loss (SE-loss) to - regularize the training. Default: True. - add_lateral (bool): Whether use lateral connection to fuse features. - Default: False. - loss_se_decode (dict): Config of decode loss. - Default: dict(type='CrossEntropyLoss', use_sigmoid=True). 
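A small sketch of the channel re-weighting at the end of EncModule.forward above; the gating vector here is a random stand-in for the learned Encoding + fc branch, and the shapes are made up.

import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 4, 4)                  # (batch, channels, H, W)
gamma = torch.sigmoid(torch.randn(2, 8))     # stand-in for self.fc(encoding_feat)
y = gamma.view(2, 8, 1, 1)
output = F.relu_(x + x * y)                  # scale channels, add residual, ReLU in place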
- """ - - def __init__(self, - num_codes=32, - use_se_loss=True, - add_lateral=False, - loss_se_decode=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=0.2), - **kwargs): - super(EncHead, self).__init__( - input_transform='multiple_select', **kwargs) - self.use_se_loss = use_se_loss - self.add_lateral = add_lateral - self.num_codes = num_codes - self.bottleneck = ConvModule( - self.in_channels[-1], - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - if add_lateral: - self.lateral_convs = nn.ModuleList() - for in_channels in self.in_channels[:-1]: # skip the last one - self.lateral_convs.append( - ConvModule( - in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - self.fusion = ConvModule( - len(self.in_channels) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.enc_module = EncModule( - self.channels, - num_codes=num_codes, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - if self.use_se_loss: - self.loss_se_decode = build_loss(loss_se_decode) - self.se_layer = nn.Linear(self.channels, self.num_classes) - - def forward(self, inputs): - """Forward function.""" - inputs = self._transform_inputs(inputs) - feat = self.bottleneck(inputs[-1]) - if self.add_lateral: - laterals = [ - resize( - lateral_conv(inputs[i]), - size=feat.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - for i, lateral_conv in enumerate(self.lateral_convs) - ] - feat = self.fusion(torch.cat([feat, *laterals], 1)) - encode_feat, output = self.enc_module(feat) - output = self.cls_seg(output) - if self.use_se_loss: - se_output = self.se_layer(encode_feat) - return output, se_output - else: - return output - - def forward_test(self, inputs, img_metas, test_cfg): - """Forward function for testing, ignore se_loss.""" - if self.use_se_loss: - return self.forward(inputs)[0] - else: - return self.forward(inputs) - - @staticmethod - def _convert_to_onehot_labels(seg_label, num_classes): - """Convert segmentation label to onehot. - - Args: - seg_label (Tensor): Segmentation label of shape (N, H, W). - num_classes (int): Number of classes. - - Returns: - Tensor: Onehot labels of shape (N, num_classes). 
- """ - - batch_size = seg_label.size(0) - onehot_labels = seg_label.new_zeros((batch_size, num_classes)) - for i in range(batch_size): - hist = seg_label[i].float().histc( - bins=num_classes, min=0, max=num_classes - 1) - onehot_labels[i] = hist > 0 - return onehot_labels - - def losses(self, seg_logit, seg_label): - """Compute segmentation and semantic encoding loss.""" - seg_logit, se_seg_logit = seg_logit - loss = dict() - loss.update(super(EncHead, self).losses(seg_logit, seg_label)) - se_loss = self.loss_se_decode( - se_seg_logit, - self._convert_to_onehot_labels(seg_label, self.num_classes)) - loss['loss_se'] = se_loss - return loss diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fcn_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fcn_head.py deleted file mode 100644 index edb32c283fa4baada6b4a0bf3f7540c3580c3468..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fcn_head.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -@HEADS.register_module() -class FCNHead(BaseDecodeHead): - """Fully Convolution Networks for Semantic Segmentation. - - This head is implemented of `FCNNet `_. - - Args: - num_convs (int): Number of convs in the head. Default: 2. - kernel_size (int): The kernel size for convs in the head. Default: 3. - concat_input (bool): Whether concat the input and output of convs - before classification layer. - dilation (int): The dilation rate for convs in the head. Default: 1. - """ - - def __init__(self, - num_convs=2, - kernel_size=3, - concat_input=True, - dilation=1, - **kwargs): - assert num_convs >= 0 and dilation > 0 and isinstance(dilation, int) - self.num_convs = num_convs - self.concat_input = concat_input - self.kernel_size = kernel_size - super(FCNHead, self).__init__(**kwargs) - if num_convs == 0: - assert self.in_channels == self.channels - - conv_padding = (kernel_size // 2) * dilation - convs = [] - convs.append( - ConvModule( - self.in_channels, - self.channels, - kernel_size=kernel_size, - padding=conv_padding, - dilation=dilation, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - for i in range(num_convs - 1): - convs.append( - ConvModule( - self.channels, - self.channels, - kernel_size=kernel_size, - padding=conv_padding, - dilation=dilation, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - if num_convs == 0: - self.convs = nn.Identity() - else: - self.convs = nn.Sequential(*convs) - if self.concat_input: - self.conv_cat = ConvModule( - self.in_channels + self.channels, - self.channels, - kernel_size=kernel_size, - padding=kernel_size // 2, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - output = self.convs(x) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fpn_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fpn_head.py deleted file mode 100644 index 1241c55b0813d1ecdddf1e66e7c5031fbf78ed50..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/fpn_head.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import 
torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -@HEADS.register_module() -class FPNHead(BaseDecodeHead): - """Panoptic Feature Pyramid Networks. - - This head is the implementation of `Semantic FPN - `_. - - Args: - feature_strides (tuple[int]): The strides for input feature maps. - stack_lateral. All strides suppose to be power of 2. The first - one is of largest resolution. - """ - - def __init__(self, feature_strides, **kwargs): - super(FPNHead, self).__init__( - input_transform='multiple_select', **kwargs) - assert len(feature_strides) == len(self.in_channels) - assert min(feature_strides) == feature_strides[0] - self.feature_strides = feature_strides - - self.scale_heads = nn.ModuleList() - for i in range(len(feature_strides)): - head_length = max( - 1, - int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) - scale_head = [] - for k in range(head_length): - scale_head.append( - ConvModule( - self.in_channels[i] if k == 0 else self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - if feature_strides[i] != feature_strides[0]: - scale_head.append( - nn.Upsample( - scale_factor=2, - mode='bilinear', - align_corners=self.align_corners)) - self.scale_heads.append(nn.Sequential(*scale_head)) - - def forward(self, inputs): - - x = self._transform_inputs(inputs) - - output = self.scale_heads[0](x[0]) - for i in range(1, len(self.feature_strides)): - # non inplace - output = output + resize( - self.scale_heads[i](x[i]), - size=output.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/gc_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/gc_head.py deleted file mode 100644 index 70741245af975800840709911bd18d72247e3e04..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/gc_head.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from annotator.uniformer.mmcv.cnn import ContextBlock - -from ..builder import HEADS -from .fcn_head import FCNHead - - -@HEADS.register_module() -class GCHead(FCNHead): - """GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond. - - This head is the implementation of `GCNet - `_. - - Args: - ratio (float): Multiplier of channels ratio. Default: 1/4. - pooling_type (str): The pooling type of context aggregation. - Options are 'att', 'avg'. Default: 'avg'. - fusion_types (tuple[str]): The fusion type for feature fusion. - Options are 'channel_add', 'channel_mul'. 
Default: ('channel_add',) - """ - - def __init__(self, - ratio=1 / 4., - pooling_type='att', - fusion_types=('channel_add', ), - **kwargs): - super(GCHead, self).__init__(num_convs=2, **kwargs) - self.ratio = ratio - self.pooling_type = pooling_type - self.fusion_types = fusion_types - self.gc_block = ContextBlock( - in_channels=self.channels, - ratio=self.ratio, - pooling_type=self.pooling_type, - fusion_types=self.fusion_types) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - output = self.convs[0](x) - output = self.gc_block(output) - output = self.convs[1](output) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/lraspp_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/lraspp_head.py deleted file mode 100644 index 69bf320934d787aaa11984a0c4effe9ad8015b22..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/lraspp_head.py +++ /dev/null @@ -1,90 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv import is_tuple_of -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -@HEADS.register_module() -class LRASPPHead(BaseDecodeHead): - """Lite R-ASPP (LRASPP) head is proposed in Searching for MobileNetV3. - - This head is the improved implementation of `Searching for MobileNetV3 - `_. - - Args: - branch_channels (tuple[int]): The number of output channels in every - each branch. Default: (32, 64). - """ - - def __init__(self, branch_channels=(32, 64), **kwargs): - super(LRASPPHead, self).__init__(**kwargs) - if self.input_transform != 'multiple_select': - raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' - f'must be \'multiple_select\'. 
But received ' - f'\'{self.input_transform}\'') - assert is_tuple_of(branch_channels, int) - assert len(branch_channels) == len(self.in_channels) - 1 - self.branch_channels = branch_channels - - self.convs = nn.Sequential() - self.conv_ups = nn.Sequential() - for i in range(len(branch_channels)): - self.convs.add_module( - f'conv{i}', - nn.Conv2d( - self.in_channels[i], branch_channels[i], 1, bias=False)) - self.conv_ups.add_module( - f'conv_up{i}', - ConvModule( - self.channels + branch_channels[i], - self.channels, - 1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - bias=False)) - - self.conv_up_input = nn.Conv2d(self.channels, self.channels, 1) - - self.aspp_conv = ConvModule( - self.in_channels[-1], - self.channels, - 1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - bias=False) - self.image_pool = nn.Sequential( - nn.AvgPool2d(kernel_size=49, stride=(16, 20)), - ConvModule( - self.in_channels[2], - self.channels, - 1, - act_cfg=dict(type='Sigmoid'), - bias=False)) - - def forward(self, inputs): - """Forward function.""" - inputs = self._transform_inputs(inputs) - - x = inputs[-1] - - x = self.aspp_conv(x) * resize( - self.image_pool(x), - size=x.size()[2:], - mode='bilinear', - align_corners=self.align_corners) - x = self.conv_up_input(x) - - for i in range(len(self.branch_channels) - 1, -1, -1): - x = resize( - x, - size=inputs[i].size()[2:], - mode='bilinear', - align_corners=self.align_corners) - x = torch.cat([x, self.convs[i](inputs[i])], 1) - x = self.conv_ups[i](x) - - return self.cls_seg(x) diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/nl_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/nl_head.py deleted file mode 100644 index 3eee424199e6aa363b564e2a3340a070db04db86..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/nl_head.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -from annotator.uniformer.mmcv.cnn import NonLocal2d - -from ..builder import HEADS -from .fcn_head import FCNHead - - -@HEADS.register_module() -class NLHead(FCNHead): - """Non-local Neural Networks. - - This head is the implementation of `NLNet - `_. - - Args: - reduction (int): Reduction factor of projection transform. Default: 2. - use_scale (bool): Whether to scale pairwise_weight by - sqrt(1/inter_channels). Default: True. - mode (str): The nonlocal mode. Options are 'embedded_gaussian', - 'dot_product'. Default: 'embedded_gaussian.'. 
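A simplified sketch of the sigmoid-gated pooling pattern in LRASPPHead.forward above; plain torch ops stand in for the ConvModule branches, and the 49x49 strided pooling window is reduced to a single global average for brevity.

import torch
import torch.nn.functional as F

x = torch.randn(1, 16, 8, 10)                                  # high-level feature map
gate = torch.sigmoid(F.avg_pool2d(x, kernel_size=(8, 10)))     # global context, (1, 16, 1, 1)
gated = x * F.interpolate(gate, size=x.shape[2:], mode='bilinear', align_corners=False)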
- """ - - def __init__(self, - reduction=2, - use_scale=True, - mode='embedded_gaussian', - **kwargs): - super(NLHead, self).__init__(num_convs=2, **kwargs) - self.reduction = reduction - self.use_scale = use_scale - self.mode = mode - self.nl_block = NonLocal2d( - in_channels=self.channels, - reduction=self.reduction, - use_scale=self.use_scale, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - mode=self.mode) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - output = self.convs[0](x) - output = self.nl_block(output) - output = self.convs[1](output) - if self.concat_input: - output = self.conv_cat(torch.cat([x, output], dim=1)) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py deleted file mode 100644 index 715852e94e81dc46623972748285d2d19237a341..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/ocr_head.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from ..utils import SelfAttentionBlock as _SelfAttentionBlock -from .cascade_decode_head import BaseCascadeDecodeHead - - -class SpatialGatherModule(nn.Module): - """Aggregate the context features according to the initial predicted - probability distribution. - - Employ the soft-weighted method to aggregate the context. - """ - - def __init__(self, scale): - super(SpatialGatherModule, self).__init__() - self.scale = scale - - def forward(self, feats, probs): - """Forward function.""" - batch_size, num_classes, height, width = probs.size() - channels = feats.size(1) - probs = probs.view(batch_size, num_classes, -1) - feats = feats.view(batch_size, channels, -1) - # [batch_size, height*width, num_classes] - feats = feats.permute(0, 2, 1) - # [batch_size, channels, height*width] - probs = F.softmax(self.scale * probs, dim=2) - # [batch_size, channels, num_classes] - ocr_context = torch.matmul(probs, feats) - ocr_context = ocr_context.permute(0, 2, 1).contiguous().unsqueeze(3) - return ocr_context - - -class ObjectAttentionBlock(_SelfAttentionBlock): - """Make a OCR used SelfAttentionBlock.""" - - def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, - act_cfg): - if scale > 1: - query_downsample = nn.MaxPool2d(kernel_size=scale) - else: - query_downsample = None - super(ObjectAttentionBlock, self).__init__( - key_in_channels=in_channels, - query_in_channels=in_channels, - channels=channels, - out_channels=in_channels, - share_key_query=False, - query_downsample=query_downsample, - key_downsample=None, - key_query_num_convs=2, - key_query_norm=True, - value_out_num_convs=1, - value_out_norm=True, - matmul_norm=True, - with_out=True, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.bottleneck = ConvModule( - in_channels * 2, - in_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, query_feats, key_feats): - """Forward function.""" - context = super(ObjectAttentionBlock, - self).forward(query_feats, key_feats) - output = self.bottleneck(torch.cat([context, query_feats], dim=1)) - if self.query_downsample is not None: - output = resize(query_feats) - - return output - - -@HEADS.register_module() -class 
OCRHead(BaseCascadeDecodeHead): - """Object-Contextual Representations for Semantic Segmentation. - - This head is the implementation of `OCRNet - `_. - - Args: - ocr_channels (int): The intermediate channels of OCR block. - scale (int): The scale of probability map in SpatialGatherModule in - Default: 1. - """ - - def __init__(self, ocr_channels, scale=1, **kwargs): - super(OCRHead, self).__init__(**kwargs) - self.ocr_channels = ocr_channels - self.scale = scale - self.object_context_block = ObjectAttentionBlock( - self.channels, - self.ocr_channels, - self.scale, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.spatial_gather_module = SpatialGatherModule(self.scale) - - self.bottleneck = ConvModule( - self.in_channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs, prev_output): - """Forward function.""" - x = self._transform_inputs(inputs) - feats = self.bottleneck(x) - context = self.spatial_gather_module(feats, prev_output) - object_context = self.object_context_block(feats, context) - output = self.cls_seg(object_context) - - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/point_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/point_head.py deleted file mode 100644 index 3342aa28bb8d264b2c3d01cbf5098d145943c193..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/point_head.py +++ /dev/null @@ -1,349 +0,0 @@ -# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa - -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule, normal_init -from annotator.uniformer.mmcv.ops import point_sample - -from annotator.uniformer.mmseg.models.builder import HEADS -from annotator.uniformer.mmseg.ops import resize -from ..losses import accuracy -from .cascade_decode_head import BaseCascadeDecodeHead - - -def calculate_uncertainty(seg_logits): - """Estimate uncertainty based on seg logits. - - For each location of the prediction ``seg_logits`` we estimate - uncertainty as the difference between top first and top second - predicted logits. - - Args: - seg_logits (Tensor): Semantic segmentation logits, - shape (batch_size, num_classes, height, width). - - Returns: - scores (Tensor): T uncertainty scores with the most uncertain - locations having the highest uncertainty score, shape ( - batch_size, 1, height, width) - """ - top2_scores = torch.topk(seg_logits, k=2, dim=1)[0] - return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1) - - -@HEADS.register_module() -class PointHead(BaseCascadeDecodeHead): - """A mask point head use in PointRend. - - ``PointHead`` use shared multi-layer perceptron (equivalent to - nn.Conv1d) to predict the logit of input points. The fine-grained feature - and coarse feature will be concatenate together for predication. - - Args: - num_fcs (int): Number of fc layers in the head. Default: 3. - in_channels (int): Number of input channels. Default: 256. - fc_channels (int): Number of fc channels. Default: 256. - num_classes (int): Number of classes for logits. Default: 80. - class_agnostic (bool): Whether use class agnostic classification. - If so, the output channels of logits will be 1. Default: False. - coarse_pred_each_layer (bool): Whether concatenate coarse feature with - the output of each fc layer. Default: True. 
- conv_cfg (dict|None): Dictionary to construct and config conv layer. - Default: dict(type='Conv1d')) - norm_cfg (dict|None): Dictionary to construct and config norm layer. - Default: None. - loss_point (dict): Dictionary to construct and config loss layer of - point head. Default: dict(type='CrossEntropyLoss', use_mask=True, - loss_weight=1.0). - """ - - def __init__(self, - num_fcs=3, - coarse_pred_each_layer=True, - conv_cfg=dict(type='Conv1d'), - norm_cfg=None, - act_cfg=dict(type='ReLU', inplace=False), - **kwargs): - super(PointHead, self).__init__( - input_transform='multiple_select', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - **kwargs) - - self.num_fcs = num_fcs - self.coarse_pred_each_layer = coarse_pred_each_layer - - fc_in_channels = sum(self.in_channels) + self.num_classes - fc_channels = self.channels - self.fcs = nn.ModuleList() - for k in range(num_fcs): - fc = ConvModule( - fc_in_channels, - fc_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.fcs.append(fc) - fc_in_channels = fc_channels - fc_in_channels += self.num_classes if self.coarse_pred_each_layer \ - else 0 - self.fc_seg = nn.Conv1d( - fc_in_channels, - self.num_classes, - kernel_size=1, - stride=1, - padding=0) - if self.dropout_ratio > 0: - self.dropout = nn.Dropout(self.dropout_ratio) - delattr(self, 'conv_seg') - - def init_weights(self): - """Initialize weights of classification layer.""" - normal_init(self.fc_seg, std=0.001) - - def cls_seg(self, feat): - """Classify each pixel with fc.""" - if self.dropout is not None: - feat = self.dropout(feat) - output = self.fc_seg(feat) - return output - - def forward(self, fine_grained_point_feats, coarse_point_feats): - x = torch.cat([fine_grained_point_feats, coarse_point_feats], dim=1) - for fc in self.fcs: - x = fc(x) - if self.coarse_pred_each_layer: - x = torch.cat((x, coarse_point_feats), dim=1) - return self.cls_seg(x) - - def _get_fine_grained_point_feats(self, x, points): - """Sample from fine grained features. - - Args: - x (list[Tensor]): Feature pyramid from by neck or backbone. - points (Tensor): Point coordinates, shape (batch_size, - num_points, 2). - - Returns: - fine_grained_feats (Tensor): Sampled fine grained feature, - shape (batch_size, sum(channels of x), num_points). - """ - - fine_grained_feats_list = [ - point_sample(_, points, align_corners=self.align_corners) - for _ in x - ] - if len(fine_grained_feats_list) > 1: - fine_grained_feats = torch.cat(fine_grained_feats_list, dim=1) - else: - fine_grained_feats = fine_grained_feats_list[0] - - return fine_grained_feats - - def _get_coarse_point_feats(self, prev_output, points): - """Sample from fine grained features. - - Args: - prev_output (list[Tensor]): Prediction of previous decode head. - points (Tensor): Point coordinates, shape (batch_size, - num_points, 2). - - Returns: - coarse_feats (Tensor): Sampled coarse feature, shape (batch_size, - num_classes, num_points). - """ - - coarse_feats = point_sample( - prev_output, points, align_corners=self.align_corners) - - return coarse_feats - - def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, - train_cfg): - """Forward function for training. - Args: - inputs (list[Tensor]): List of multi-level img features. - prev_output (Tensor): The output of previous decode head. 
- img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - gt_semantic_seg (Tensor): Semantic segmentation masks - used if the architecture supports semantic segmentation task. - train_cfg (dict): The training config. - - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - x = self._transform_inputs(inputs) - with torch.no_grad(): - points = self.get_points_train( - prev_output, calculate_uncertainty, cfg=train_cfg) - fine_grained_point_feats = self._get_fine_grained_point_feats( - x, points) - coarse_point_feats = self._get_coarse_point_feats(prev_output, points) - point_logits = self.forward(fine_grained_point_feats, - coarse_point_feats) - point_label = point_sample( - gt_semantic_seg.float(), - points, - mode='nearest', - align_corners=self.align_corners) - point_label = point_label.squeeze(1).long() - - losses = self.losses(point_logits, point_label) - - return losses - - def forward_test(self, inputs, prev_output, img_metas, test_cfg): - """Forward function for testing. - - Args: - inputs (list[Tensor]): List of multi-level img features. - prev_output (Tensor): The output of previous decode head. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - test_cfg (dict): The testing config. - - Returns: - Tensor: Output segmentation map. - """ - - x = self._transform_inputs(inputs) - refined_seg_logits = prev_output.clone() - for _ in range(test_cfg.subdivision_steps): - refined_seg_logits = resize( - refined_seg_logits, - scale_factor=test_cfg.scale_factor, - mode='bilinear', - align_corners=self.align_corners) - batch_size, channels, height, width = refined_seg_logits.shape - point_indices, points = self.get_points_test( - refined_seg_logits, calculate_uncertainty, cfg=test_cfg) - fine_grained_point_feats = self._get_fine_grained_point_feats( - x, points) - coarse_point_feats = self._get_coarse_point_feats( - prev_output, points) - point_logits = self.forward(fine_grained_point_feats, - coarse_point_feats) - - point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) - refined_seg_logits = refined_seg_logits.reshape( - batch_size, channels, height * width) - refined_seg_logits = refined_seg_logits.scatter_( - 2, point_indices, point_logits) - refined_seg_logits = refined_seg_logits.view( - batch_size, channels, height, width) - - return refined_seg_logits - - def losses(self, point_logits, point_label): - """Compute segmentation loss.""" - loss = dict() - loss['loss_point'] = self.loss_decode( - point_logits, point_label, ignore_index=self.ignore_index) - loss['acc_point'] = accuracy(point_logits, point_label) - return loss - - def get_points_train(self, seg_logits, uncertainty_func, cfg): - """Sample points for training. - - Sample points in [0, 1] x [0, 1] coordinate space based on their - uncertainty. The uncertainties are calculated for each point using - 'uncertainty_func' function that takes point's logit prediction as - input. - - Args: - seg_logits (Tensor): Semantic segmentation logits, shape ( - batch_size, num_classes, height, width). 
- uncertainty_func (func): uncertainty calculation function. - cfg (dict): Training config of point head. - - Returns: - point_coords (Tensor): A tensor of shape (batch_size, num_points, - 2) that contains the coordinates of ``num_points`` sampled - points. - """ - num_points = cfg.num_points - oversample_ratio = cfg.oversample_ratio - importance_sample_ratio = cfg.importance_sample_ratio - assert oversample_ratio >= 1 - assert 0 <= importance_sample_ratio <= 1 - batch_size = seg_logits.shape[0] - num_sampled = int(num_points * oversample_ratio) - point_coords = torch.rand( - batch_size, num_sampled, 2, device=seg_logits.device) - point_logits = point_sample(seg_logits, point_coords) - # It is crucial to calculate uncertainty based on the sampled - # prediction value for the points. Calculating uncertainties of the - # coarse predictions first and sampling them for points leads to - # incorrect results. To illustrate this: assume uncertainty func( - # logits)=-abs(logits), a sampled point between two coarse - # predictions with -1 and 1 logits has 0 logits, and therefore 0 - # uncertainty value. However, if we calculate uncertainties for the - # coarse predictions first, both will have -1 uncertainty, - # and sampled point will get -1 uncertainty. - point_uncertainties = uncertainty_func(point_logits) - num_uncertain_points = int(importance_sample_ratio * num_points) - num_random_points = num_points - num_uncertain_points - idx = torch.topk( - point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] - shift = num_sampled * torch.arange( - batch_size, dtype=torch.long, device=seg_logits.device) - idx += shift[:, None] - point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( - batch_size, num_uncertain_points, 2) - if num_random_points > 0: - rand_point_coords = torch.rand( - batch_size, num_random_points, 2, device=seg_logits.device) - point_coords = torch.cat((point_coords, rand_point_coords), dim=1) - return point_coords - - def get_points_test(self, seg_logits, uncertainty_func, cfg): - """Sample points for testing. - - Find ``num_points`` most uncertain points from ``uncertainty_map``. - - Args: - seg_logits (Tensor): A tensor of shape (batch_size, num_classes, - height, width) for class-specific or class-agnostic prediction. - uncertainty_func (func): uncertainty calculation function. - cfg (dict): Testing config of point head. - - Returns: - point_indices (Tensor): A tensor of shape (batch_size, num_points) - that contains indices from [0, height x width) of the most - uncertain points. - point_coords (Tensor): A tensor of shape (batch_size, num_points, - 2) that contains [0, 1] x [0, 1] normalized coordinates of the - most uncertain points from the ``height x width`` grid . 
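A toy, self-contained sketch of the importance-sampling split implemented in get_points_train above; num_points, the ratios, and the random uncertainty scores are made-up stand-ins for the cfg values and real point logits.

import torch

num_points, oversample_ratio, importance_sample_ratio = 8, 3, 0.75
batch_size = 2
num_sampled = int(num_points * oversample_ratio)                # 24 candidate points
point_coords = torch.rand(batch_size, num_sampled, 2)
point_uncertainties = torch.rand(batch_size, 1, num_sampled)    # stand-in scores

num_uncertain = int(importance_sample_ratio * num_points)       # 6 hardest points kept
num_random = num_points - num_uncertain                         # 2 extra random points
idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain, dim=1)[1]
idx += num_sampled * torch.arange(batch_size)[:, None]          # offset per sample
picked = point_coords.view(-1, 2)[idx.view(-1)].view(batch_size, num_uncertain, 2)
points = torch.cat([picked, torch.rand(batch_size, num_random, 2)], dim=1)
# points.shape == (2, 8, 2)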
- """ - - num_points = cfg.subdivision_num_points - uncertainty_map = uncertainty_func(seg_logits) - batch_size, _, height, width = uncertainty_map.shape - h_step = 1.0 / height - w_step = 1.0 / width - - uncertainty_map = uncertainty_map.view(batch_size, height * width) - num_points = min(height * width, num_points) - point_indices = uncertainty_map.topk(num_points, dim=1)[1] - point_coords = torch.zeros( - batch_size, - num_points, - 2, - dtype=torch.float, - device=seg_logits.device) - point_coords[:, :, 0] = w_step / 2.0 + (point_indices % - width).float() * w_step - point_coords[:, :, 1] = h_step / 2.0 + (point_indices // - width).float() * h_step - return point_indices, point_coords diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psa_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psa_head.py deleted file mode 100644 index 480dbd1a081262e45bf87e32c4a339ac8f8b4ffb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psa_head.py +++ /dev/null @@ -1,196 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - -try: - from annotator.uniformer.mmcv.ops import PSAMask -except ModuleNotFoundError: - PSAMask = None - - -@HEADS.register_module() -class PSAHead(BaseDecodeHead): - """Point-wise Spatial Attention Network for Scene Parsing. - - This head is the implementation of `PSANet - `_. - - Args: - mask_size (tuple[int]): The PSA mask size. It usually equals input - size. - psa_type (str): The type of psa module. Options are 'collect', - 'distribute', 'bi-direction'. Default: 'bi-direction' - compact (bool): Whether use compact map for 'collect' mode. - Default: True. - shrink_factor (int): The downsample factors of psa mask. Default: 2. - normalization_factor (float): The normalize factor of attention. - psa_softmax (bool): Whether use softmax for attention. 
- """ - - def __init__(self, - mask_size, - psa_type='bi-direction', - compact=False, - shrink_factor=2, - normalization_factor=1.0, - psa_softmax=True, - **kwargs): - if PSAMask is None: - raise RuntimeError('Please install mmcv-full for PSAMask ops') - super(PSAHead, self).__init__(**kwargs) - assert psa_type in ['collect', 'distribute', 'bi-direction'] - self.psa_type = psa_type - self.compact = compact - self.shrink_factor = shrink_factor - self.mask_size = mask_size - mask_h, mask_w = mask_size - self.psa_softmax = psa_softmax - if normalization_factor is None: - normalization_factor = mask_h * mask_w - self.normalization_factor = normalization_factor - - self.reduce = ConvModule( - self.in_channels, - self.channels, - kernel_size=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.attention = nn.Sequential( - ConvModule( - self.channels, - self.channels, - kernel_size=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg), - nn.Conv2d( - self.channels, mask_h * mask_w, kernel_size=1, bias=False)) - if psa_type == 'bi-direction': - self.reduce_p = ConvModule( - self.in_channels, - self.channels, - kernel_size=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.attention_p = nn.Sequential( - ConvModule( - self.channels, - self.channels, - kernel_size=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg), - nn.Conv2d( - self.channels, mask_h * mask_w, kernel_size=1, bias=False)) - self.psamask_collect = PSAMask('collect', mask_size) - self.psamask_distribute = PSAMask('distribute', mask_size) - else: - self.psamask = PSAMask(psa_type, mask_size) - self.proj = ConvModule( - self.channels * (2 if psa_type == 'bi-direction' else 1), - self.in_channels, - kernel_size=1, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.bottleneck = ConvModule( - self.in_channels * 2, - self.channels, - kernel_size=3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - identity = x - align_corners = self.align_corners - if self.psa_type in ['collect', 'distribute']: - out = self.reduce(x) - n, c, h, w = out.size() - if self.shrink_factor != 1: - if h % self.shrink_factor and w % self.shrink_factor: - h = (h - 1) // self.shrink_factor + 1 - w = (w - 1) // self.shrink_factor + 1 - align_corners = True - else: - h = h // self.shrink_factor - w = w // self.shrink_factor - align_corners = False - out = resize( - out, - size=(h, w), - mode='bilinear', - align_corners=align_corners) - y = self.attention(out) - if self.compact: - if self.psa_type == 'collect': - y = y.view(n, h * w, - h * w).transpose(1, 2).view(n, h * w, h, w) - else: - y = self.psamask(y) - if self.psa_softmax: - y = F.softmax(y, dim=1) - out = torch.bmm( - out.view(n, c, h * w), y.view(n, h * w, h * w)).view( - n, c, h, w) * (1.0 / self.normalization_factor) - else: - x_col = self.reduce(x) - x_dis = self.reduce_p(x) - n, c, h, w = x_col.size() - if self.shrink_factor != 1: - if h % self.shrink_factor and w % self.shrink_factor: - h = (h - 1) // self.shrink_factor + 1 - w = (w - 1) // self.shrink_factor + 1 - align_corners = True - else: - h = h // self.shrink_factor - w = w // self.shrink_factor - align_corners = False - x_col = resize( - x_col, - size=(h, w), - mode='bilinear', - align_corners=align_corners) - x_dis = resize( - x_dis, - size=(h, w), - 
mode='bilinear', - align_corners=align_corners) - y_col = self.attention(x_col) - y_dis = self.attention_p(x_dis) - if self.compact: - y_dis = y_dis.view(n, h * w, - h * w).transpose(1, 2).view(n, h * w, h, w) - else: - y_col = self.psamask_collect(y_col) - y_dis = self.psamask_distribute(y_dis) - if self.psa_softmax: - y_col = F.softmax(y_col, dim=1) - y_dis = F.softmax(y_dis, dim=1) - x_col = torch.bmm( - x_col.view(n, c, h * w), y_col.view(n, h * w, h * w)).view( - n, c, h, w) * (1.0 / self.normalization_factor) - x_dis = torch.bmm( - x_dis.view(n, c, h * w), y_dis.view(n, h * w, h * w)).view( - n, c, h, w) * (1.0 / self.normalization_factor) - out = torch.cat([x_col, x_dis], 1) - out = self.proj(out) - out = resize( - out, - size=identity.shape[2:], - mode='bilinear', - align_corners=align_corners) - out = self.bottleneck(torch.cat((identity, out), dim=1)) - out = self.cls_seg(out) - return out diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psp_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psp_head.py deleted file mode 100644 index b5f1e71c70c3a20f4007c263ec471a87bb214a48..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/psp_head.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class PPM(nn.ModuleList): - """Pooling Pyramid Module used in PSPNet. - - Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module. - in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict): Config of activation layers. - align_corners (bool): align_corners argument of F.interpolate. - """ - - def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, - act_cfg, align_corners): - super(PPM, self).__init__() - self.pool_scales = pool_scales - self.align_corners = align_corners - self.in_channels = in_channels - self.channels = channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - for pool_scale in pool_scales: - self.append( - nn.Sequential( - nn.AdaptiveAvgPool2d(pool_scale), - ConvModule( - self.in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg))) - - def forward(self, x): - """Forward function.""" - ppm_outs = [] - for ppm in self: - ppm_out = ppm(x) - upsampled_ppm_out = resize( - ppm_out, - size=x.size()[2:], - mode='bilinear', - align_corners=self.align_corners) - ppm_outs.append(upsampled_ppm_out) - return ppm_outs - - -@HEADS.register_module() -class PSPHead(BaseDecodeHead): - """Pyramid Scene Parsing Network. - - This head is the implementation of - `PSPNet `_. - - Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module. Default: (1, 2, 3, 6). 
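The PPM module above pools the input to several grid sizes, projects each with a 1x1 convolution, and upsamples back so everything can be concatenated with the original feature map. A stripped-down sketch without ConvModule, normalization, or activation layers (sizes are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyPPM(nn.Module):
    # Minimal pooling pyramid: pool to each scale, project, upsample back.
    def __init__(self, in_ch, ch, pool_scales=(1, 2, 3, 6)):
        super().__init__()
        self.stages = nn.ModuleList(
            nn.Sequential(nn.AdaptiveAvgPool2d(s), nn.Conv2d(in_ch, ch, 1))
            for s in pool_scales)

    def forward(self, x):
        outs = [x]
        for stage in self.stages:
            outs.append(F.interpolate(stage(x), size=x.shape[2:],
                                      mode='bilinear', align_corners=False))
        return torch.cat(outs, dim=1)

feats = torch.randn(1, 512, 64, 64)
print(TinyPPM(512, 128)(feats).shape)  # torch.Size([1, 1024, 64, 64])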
- """ - - def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(PSPHead, self).__init__(**kwargs) - assert isinstance(pool_scales, (list, tuple)) - self.pool_scales = pool_scales - self.psp_modules = PPM( - self.pool_scales, - self.in_channels, - self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - align_corners=self.align_corners) - self.bottleneck = ConvModule( - self.in_channels + len(pool_scales) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - psp_outs = [x] - psp_outs.extend(self.psp_modules(x)) - psp_outs = torch.cat(psp_outs, dim=1) - output = self.bottleneck(psp_outs) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py deleted file mode 100644 index 3339a7ac56e77dfc638e9bffb557d4699148686b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_aspp_head.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule, DepthwiseSeparableConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .aspp_head import ASPPHead, ASPPModule - - -class DepthwiseSeparableASPPModule(ASPPModule): - """Atrous Spatial Pyramid Pooling (ASPP) Module with depthwise separable - conv.""" - - def __init__(self, **kwargs): - super(DepthwiseSeparableASPPModule, self).__init__(**kwargs) - for i, dilation in enumerate(self.dilations): - if dilation > 1: - self[i] = DepthwiseSeparableConvModule( - self.in_channels, - self.channels, - 3, - dilation=dilation, - padding=dilation, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - -@HEADS.register_module() -class DepthwiseSeparableASPPHead(ASPPHead): - """Encoder-Decoder with Atrous Separable Convolution for Semantic Image - Segmentation. - - This head is the implementation of `DeepLabV3+ - `_. - - Args: - c1_in_channels (int): The input channels of c1 decoder. If is 0, - the no decoder will be used. - c1_channels (int): The intermediate channels of c1 decoder. 
- """ - - def __init__(self, c1_in_channels, c1_channels, **kwargs): - super(DepthwiseSeparableASPPHead, self).__init__(**kwargs) - assert c1_in_channels >= 0 - self.aspp_modules = DepthwiseSeparableASPPModule( - dilations=self.dilations, - in_channels=self.in_channels, - channels=self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - if c1_in_channels > 0: - self.c1_bottleneck = ConvModule( - c1_in_channels, - c1_channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - else: - self.c1_bottleneck = None - self.sep_bottleneck = nn.Sequential( - DepthwiseSeparableConvModule( - self.channels + c1_channels, - self.channels, - 3, - padding=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg), - DepthwiseSeparableConvModule( - self.channels, - self.channels, - 3, - padding=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg)) - - def forward(self, inputs): - """Forward function.""" - x = self._transform_inputs(inputs) - aspp_outs = [ - resize( - self.image_pool(x), - size=x.size()[2:], - mode='bilinear', - align_corners=self.align_corners) - ] - aspp_outs.extend(self.aspp_modules(x)) - aspp_outs = torch.cat(aspp_outs, dim=1) - output = self.bottleneck(aspp_outs) - if self.c1_bottleneck is not None: - c1_output = self.c1_bottleneck(inputs[0]) - output = resize( - input=output, - size=c1_output.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - output = torch.cat([output, c1_output], dim=1) - output = self.sep_bottleneck(output) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_fcn_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_fcn_head.py deleted file mode 100644 index a0986143fa4f2bd36f5271354fe5f843f35b9e6f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/sep_fcn_head.py +++ /dev/null @@ -1,51 +0,0 @@ -from annotator.uniformer.mmcv.cnn import DepthwiseSeparableConvModule - -from ..builder import HEADS -from .fcn_head import FCNHead - - -@HEADS.register_module() -class DepthwiseSeparableFCNHead(FCNHead): - """Depthwise-Separable Fully Convolutional Network for Semantic - Segmentation. - - This head is implemented according to Fast-SCNN paper. - Args: - in_channels(int): Number of output channels of FFM. - channels(int): Number of middle-stage channels in the decode head. - concat_input(bool): Whether to concatenate original decode input into - the result of several consecutive convolution layers. - Default: True. - num_classes(int): Used to determine the dimension of - final prediction tensor. - in_index(int): Correspond with 'out_indices' in FastSCNN backbone. - norm_cfg (dict | None): Config of norm layers. - align_corners (bool): align_corners argument of F.interpolate. - Default: False. - loss_decode(dict): Config of loss type and some - relevant additional options. 
- """ - - def __init__(self, **kwargs): - super(DepthwiseSeparableFCNHead, self).__init__(**kwargs) - self.convs[0] = DepthwiseSeparableConvModule( - self.in_channels, - self.channels, - kernel_size=self.kernel_size, - padding=self.kernel_size // 2, - norm_cfg=self.norm_cfg) - for i in range(1, self.num_convs): - self.convs[i] = DepthwiseSeparableConvModule( - self.channels, - self.channels, - kernel_size=self.kernel_size, - padding=self.kernel_size // 2, - norm_cfg=self.norm_cfg) - - if self.concat_input: - self.conv_cat = DepthwiseSeparableConvModule( - self.in_channels + self.channels, - self.channels, - kernel_size=self.kernel_size, - padding=self.kernel_size // 2, - norm_cfg=self.norm_cfg) diff --git a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/uper_head.py b/ControlNet/annotator/uniformer/mmseg/models/decode_heads/uper_head.py deleted file mode 100644 index 9e1301b706b0d83ed714bbdee8ee24693f150455..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/decode_heads/uper_head.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from annotator.uniformer.mmseg.ops import resize -from ..builder import HEADS -from .decode_head import BaseDecodeHead -from .psp_head import PPM - - -@HEADS.register_module() -class UPerHead(BaseDecodeHead): - """Unified Perceptual Parsing for Scene Understanding. - - This head is the implementation of `UPerNet - `_. - - Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module applied on the last feature. Default: (1, 2, 3, 6). - """ - - def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(UPerHead, self).__init__( - input_transform='multiple_select', **kwargs) - # PSP Module - self.psp_modules = PPM( - pool_scales, - self.in_channels[-1], - self.channels, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - align_corners=self.align_corners) - self.bottleneck = ConvModule( - self.in_channels[-1] + len(pool_scales) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - # FPN Module - self.lateral_convs = nn.ModuleList() - self.fpn_convs = nn.ModuleList() - for in_channels in self.in_channels[:-1]: # skip the top layer - l_conv = ConvModule( - in_channels, - self.channels, - 1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - inplace=False) - fpn_conv = ConvModule( - self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - inplace=False) - self.lateral_convs.append(l_conv) - self.fpn_convs.append(fpn_conv) - - self.fpn_bottleneck = ConvModule( - len(self.in_channels) * self.channels, - self.channels, - 3, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def psp_forward(self, inputs): - """Forward function of PSP module.""" - x = inputs[-1] - psp_outs = [x] - psp_outs.extend(self.psp_modules(x)) - psp_outs = torch.cat(psp_outs, dim=1) - output = self.bottleneck(psp_outs) - - return output - - def forward(self, inputs): - """Forward function.""" - - inputs = self._transform_inputs(inputs) - - # build laterals - laterals = [ - lateral_conv(inputs[i]) - for i, lateral_conv in enumerate(self.lateral_convs) - ] - - laterals.append(self.psp_forward(inputs)) - - # build top-down path - used_backbone_levels = len(laterals) - for i in range(used_backbone_levels - 1, 0, -1): 
- prev_shape = laterals[i - 1].shape[2:] - laterals[i - 1] += resize( - laterals[i], - size=prev_shape, - mode='bilinear', - align_corners=self.align_corners) - - # build outputs - fpn_outs = [ - self.fpn_convs[i](laterals[i]) - for i in range(used_backbone_levels - 1) - ] - # append psp feature - fpn_outs.append(laterals[-1]) - - for i in range(used_backbone_levels - 1, 0, -1): - fpn_outs[i] = resize( - fpn_outs[i], - size=fpn_outs[0].shape[2:], - mode='bilinear', - align_corners=self.align_corners) - fpn_outs = torch.cat(fpn_outs, dim=1) - output = self.fpn_bottleneck(fpn_outs) - output = self.cls_seg(output) - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/losses/__init__.py deleted file mode 100644 index beca72045694273d63465bac2f27dbc6672271db..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .accuracy import Accuracy, accuracy -from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, - cross_entropy, mask_cross_entropy) -from .dice_loss import DiceLoss -from .lovasz_loss import LovaszLoss -from .utils import reduce_loss, weight_reduce_loss, weighted_loss - -__all__ = [ - 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', - 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', - 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' -] diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/accuracy.py b/ControlNet/annotator/uniformer/mmseg/models/losses/accuracy.py deleted file mode 100644 index c0fd2e7e74a0f721c4a814c09d6e453e5956bb38..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/accuracy.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch.nn as nn - - -def accuracy(pred, target, topk=1, thresh=None): - """Calculate accuracy according to the prediction and target. - - Args: - pred (torch.Tensor): The model prediction, shape (N, num_class, ...) - target (torch.Tensor): The target of each prediction, shape (N, , ...) - topk (int | tuple[int], optional): If the predictions in ``topk`` - matches the target, the predictions will be regarded as - correct ones. Defaults to 1. - thresh (float, optional): If not None, predictions with scores under - this threshold are considered incorrect. Default to None. - - Returns: - float | tuple[float]: If the input ``topk`` is a single integer, - the function will return a single float as accuracy. If - ``topk`` is a tuple containing multiple integers, the - function will return a tuple containing accuracies of - each ``topk`` number. - """ - assert isinstance(topk, (int, tuple)) - if isinstance(topk, int): - topk = (topk, ) - return_single = True - else: - return_single = False - - maxk = max(topk) - if pred.size(0) == 0: - accu = [pred.new_tensor(0.) for i in range(len(topk))] - return accu[0] if return_single else accu - assert pred.ndim == target.ndim + 1 - assert pred.size(0) == target.size(0) - assert maxk <= pred.size(1), \ - f'maxk {maxk} exceeds pred dimension {pred.size(1)}' - pred_value, pred_label = pred.topk(maxk, dim=1) - # transpose to shape (maxk, N, ...) 
- pred_label = pred_label.transpose(0, 1) - correct = pred_label.eq(target.unsqueeze(0).expand_as(pred_label)) - if thresh is not None: - # Only prediction values larger than thresh are counted as correct - correct = correct & (pred_value > thresh).t() - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / target.numel())) - return res[0] if return_single else res - - -class Accuracy(nn.Module): - """Accuracy calculation module.""" - - def __init__(self, topk=(1, ), thresh=None): - """Module to calculate the accuracy. - - Args: - topk (tuple, optional): The criterion used to calculate the - accuracy. Defaults to (1,). - thresh (float, optional): If not None, predictions with scores - under this threshold are considered incorrect. Default to None. - """ - super().__init__() - self.topk = topk - self.thresh = thresh - - def forward(self, pred, target): - """Forward function to calculate accuracy. - - Args: - pred (torch.Tensor): Prediction of models. - target (torch.Tensor): Target for each prediction. - - Returns: - tuple[float]: The accuracies under different topk criterions. - """ - return accuracy(pred, target, self.topk, self.thresh) diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/cross_entropy_loss.py b/ControlNet/annotator/uniformer/mmseg/models/losses/cross_entropy_loss.py deleted file mode 100644 index 42c0790c98616bb69621deed55547fc04c7392ef..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/cross_entropy_loss.py +++ /dev/null @@ -1,198 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..builder import LOSSES -from .utils import get_class_weight, weight_reduce_loss - - -def cross_entropy(pred, - label, - weight=None, - class_weight=None, - reduction='mean', - avg_factor=None, - ignore_index=-100): - """The wrapper function for :func:`F.cross_entropy`""" - # class_weight is a manual rescaling weight given to each class. - # If given, has to be a Tensor of size C element-wise losses - loss = F.cross_entropy( - pred, - label, - weight=class_weight, - reduction='none', - ignore_index=ignore_index) - - # apply weights and do the reduction - if weight is not None: - weight = weight.float() - loss = weight_reduce_loss( - loss, weight=weight, reduction=reduction, avg_factor=avg_factor) - - return loss - - -def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): - """Expand onehot labels to match the size of prediction.""" - bin_labels = labels.new_zeros(target_shape) - valid_mask = (labels >= 0) & (labels != ignore_index) - inds = torch.nonzero(valid_mask, as_tuple=True) - - if inds[0].numel() > 0: - if labels.dim() == 3: - bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 - else: - bin_labels[inds[0], labels[valid_mask]] = 1 - - valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() - if label_weights is None: - bin_label_weights = valid_mask - else: - bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) - bin_label_weights *= valid_mask - - return bin_labels, bin_label_weights - - -def binary_cross_entropy(pred, - label, - weight=None, - reduction='mean', - avg_factor=None, - class_weight=None, - ignore_index=255): - """Calculate the binary CrossEntropy loss. - - Args: - pred (torch.Tensor): The prediction with shape (N, 1). - label (torch.Tensor): The learning label of the prediction. - weight (torch.Tensor, optional): Sample-wise loss weight. 
- reduction (str, optional): The method used to reduce the loss. - Options are "none", "mean" and "sum". - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - class_weight (list[float], optional): The weight for each class. - ignore_index (int | None): The label index to be ignored. Default: 255 - - Returns: - torch.Tensor: The calculated loss - """ - if pred.dim() != label.dim(): - assert (pred.dim() == 2 and label.dim() == 1) or ( - pred.dim() == 4 and label.dim() == 3), \ - 'Only pred shape [N, C], label shape [N] or pred shape [N, C, ' \ - 'H, W], label shape [N, H, W] are supported' - label, weight = _expand_onehot_labels(label, weight, pred.shape, - ignore_index) - - # weighted element-wise losses - if weight is not None: - weight = weight.float() - loss = F.binary_cross_entropy_with_logits( - pred, label.float(), pos_weight=class_weight, reduction='none') - # do the reduction for the weighted loss - loss = weight_reduce_loss( - loss, weight, reduction=reduction, avg_factor=avg_factor) - - return loss - - -def mask_cross_entropy(pred, - target, - label, - reduction='mean', - avg_factor=None, - class_weight=None, - ignore_index=None): - """Calculate the CrossEntropy loss for masks. - - Args: - pred (torch.Tensor): The prediction with shape (N, C), C is the number - of classes. - target (torch.Tensor): The learning label of the prediction. - label (torch.Tensor): ``label`` indicates the class label of the mask' - corresponding object. This will be used to select the mask in the - of the class which the object belongs to when the mask prediction - if not class-agnostic. - reduction (str, optional): The method used to reduce the loss. - Options are "none", "mean" and "sum". - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - class_weight (list[float], optional): The weight for each class. - ignore_index (None): Placeholder, to be consistent with other loss. - Default: None. - - Returns: - torch.Tensor: The calculated loss - """ - assert ignore_index is None, 'BCE loss does not support ignore_index' - # TODO: handle these two reserved arguments - assert reduction == 'mean' and avg_factor is None - num_rois = pred.size()[0] - inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) - pred_slice = pred[inds, label].squeeze(1) - return F.binary_cross_entropy_with_logits( - pred_slice, target, weight=class_weight, reduction='mean')[None] - - -@LOSSES.register_module() -class CrossEntropyLoss(nn.Module): - """CrossEntropyLoss. - - Args: - use_sigmoid (bool, optional): Whether the prediction uses sigmoid - of softmax. Defaults to False. - use_mask (bool, optional): Whether to use mask cross entropy loss. - Defaults to False. - reduction (str, optional): . Defaults to 'mean'. - Options are "none", "mean" and "sum". - class_weight (list[float] | str, optional): Weight of each class. If in - str format, read them from a file. Defaults to None. - loss_weight (float, optional): Weight of the loss. Defaults to 1.0. 
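For the default (softmax) branch, the ``cross_entropy`` wrapper above boils down to ``F.cross_entropy`` with ``reduction='none'`` so that sample weights and ``avg_factor`` can be applied in a second step. A small usage sketch with made-up shapes and class weights:

import torch
import torch.nn.functional as F

pred = torch.randn(2, 4, 8, 8)              # logits (N, num_classes, H, W)
label = torch.randint(0, 4, (2, 8, 8))      # hard labels (N, H, W)
class_weight = torch.tensor([1.0, 2.0, 1.0, 0.5])

# Element-wise losses first; reduction/averaging happens afterwards
# (weight_reduce_loss in this codebase).
loss = F.cross_entropy(pred, label, weight=class_weight,
                       reduction='none', ignore_index=255)
print(loss.shape)  # torch.Size([2, 8, 8])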
- """ - - def __init__(self, - use_sigmoid=False, - use_mask=False, - reduction='mean', - class_weight=None, - loss_weight=1.0): - super(CrossEntropyLoss, self).__init__() - assert (use_sigmoid is False) or (use_mask is False) - self.use_sigmoid = use_sigmoid - self.use_mask = use_mask - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = get_class_weight(class_weight) - - if self.use_sigmoid: - self.cls_criterion = binary_cross_entropy - elif self.use_mask: - self.cls_criterion = mask_cross_entropy - else: - self.cls_criterion = cross_entropy - - def forward(self, - cls_score, - label, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function.""" - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if self.class_weight is not None: - class_weight = cls_score.new_tensor(self.class_weight) - else: - class_weight = None - loss_cls = self.loss_weight * self.cls_criterion( - cls_score, - label, - weight, - class_weight=class_weight, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss_cls diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/dice_loss.py b/ControlNet/annotator/uniformer/mmseg/models/losses/dice_loss.py deleted file mode 100644 index 27a77b962d7d8b3079c7d6cd9db52280c6fb4970..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/dice_loss.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Modified from https://github.com/LikeLy-Journey/SegmenTron/blob/master/ -segmentron/solver/loss.py (Apache-2.0 License)""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..builder import LOSSES -from .utils import get_class_weight, weighted_loss - - -@weighted_loss -def dice_loss(pred, - target, - valid_mask, - smooth=1, - exponent=2, - class_weight=None, - ignore_index=255): - assert pred.shape[0] == target.shape[0] - total_loss = 0 - num_classes = pred.shape[1] - for i in range(num_classes): - if i != ignore_index: - dice_loss = binary_dice_loss( - pred[:, i], - target[..., i], - valid_mask=valid_mask, - smooth=smooth, - exponent=exponent) - if class_weight is not None: - dice_loss *= class_weight[i] - total_loss += dice_loss - return total_loss / num_classes - - -@weighted_loss -def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwards): - assert pred.shape[0] == target.shape[0] - pred = pred.reshape(pred.shape[0], -1) - target = target.reshape(target.shape[0], -1) - valid_mask = valid_mask.reshape(valid_mask.shape[0], -1) - - num = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) * 2 + smooth - den = torch.sum(pred.pow(exponent) + target.pow(exponent), dim=1) + smooth - - return 1 - num / den - - -@LOSSES.register_module() -class DiceLoss(nn.Module): - """DiceLoss. - - This loss is proposed in `V-Net: Fully Convolutional Neural Networks for - Volumetric Medical Image Segmentation `_. - - Args: - loss_type (str, optional): Binary or multi-class loss. - Default: 'multi_class'. Options are "binary" and "multi_class". - smooth (float): A float number to smooth loss, and avoid NaN error. - Default: 1 - exponent (float): An float number to calculate denominator - value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2. - reduction (str, optional): The method used to reduce the loss. Options - are "none", "mean" and "sum". This parameter only works when - per_image is True. Default: 'mean'. 
- class_weight (list[float] | str, optional): Weight of each class. If in - str format, read them from a file. Defaults to None. - loss_weight (float, optional): Weight of the loss. Default to 1.0. - ignore_index (int | None): The label index to be ignored. Default: 255. - """ - - def __init__(self, - smooth=1, - exponent=2, - reduction='mean', - class_weight=None, - loss_weight=1.0, - ignore_index=255, - **kwards): - super(DiceLoss, self).__init__() - self.smooth = smooth - self.exponent = exponent - self.reduction = reduction - self.class_weight = get_class_weight(class_weight) - self.loss_weight = loss_weight - self.ignore_index = ignore_index - - def forward(self, - pred, - target, - avg_factor=None, - reduction_override=None, - **kwards): - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if self.class_weight is not None: - class_weight = pred.new_tensor(self.class_weight) - else: - class_weight = None - - pred = F.softmax(pred, dim=1) - num_classes = pred.shape[1] - one_hot_target = F.one_hot( - torch.clamp(target.long(), 0, num_classes - 1), - num_classes=num_classes) - valid_mask = (target != self.ignore_index).long() - - loss = self.loss_weight * dice_loss( - pred, - one_hot_target, - valid_mask=valid_mask, - reduction=reduction, - avg_factor=avg_factor, - smooth=self.smooth, - exponent=self.exponent, - class_weight=class_weight, - ignore_index=self.ignore_index) - return loss diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/lovasz_loss.py b/ControlNet/annotator/uniformer/mmseg/models/losses/lovasz_loss.py deleted file mode 100644 index 6badb67f6d987b59fb07aa97caaaf89896e27a8d..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/lovasz_loss.py +++ /dev/null @@ -1,303 +0,0 @@ -"""Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor -ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim -Berman 2018 ESAT-PSI KU Leuven (MIT License)""" - -import annotator.uniformer.mmcv as mmcv -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..builder import LOSSES -from .utils import get_class_weight, weight_reduce_loss - - -def lovasz_grad(gt_sorted): - """Computes gradient of the Lovasz extension w.r.t sorted errors. - - See Alg. 1 in paper. - """ - p = len(gt_sorted) - gts = gt_sorted.sum() - intersection = gts - gt_sorted.float().cumsum(0) - union = gts + (1 - gt_sorted).float().cumsum(0) - jaccard = 1. 
- intersection / union - if p > 1: # cover 1-pixel case - jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] - return jaccard - - -def flatten_binary_logits(logits, labels, ignore_index=None): - """Flattens predictions in the batch (binary case) Remove labels equal to - 'ignore_index'.""" - logits = logits.view(-1) - labels = labels.view(-1) - if ignore_index is None: - return logits, labels - valid = (labels != ignore_index) - vlogits = logits[valid] - vlabels = labels[valid] - return vlogits, vlabels - - -def flatten_probs(probs, labels, ignore_index=None): - """Flattens predictions in the batch.""" - if probs.dim() == 3: - # assumes output of a sigmoid layer - B, H, W = probs.size() - probs = probs.view(B, 1, H, W) - B, C, H, W = probs.size() - probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, C) # B*H*W, C=P,C - labels = labels.view(-1) - if ignore_index is None: - return probs, labels - valid = (labels != ignore_index) - vprobs = probs[valid.nonzero().squeeze()] - vlabels = labels[valid] - return vprobs, vlabels - - -def lovasz_hinge_flat(logits, labels): - """Binary Lovasz hinge loss. - - Args: - logits (torch.Tensor): [P], logits at each prediction - (between -infty and +infty). - labels (torch.Tensor): [P], binary ground truth labels (0 or 1). - - Returns: - torch.Tensor: The calculated loss. - """ - if len(labels) == 0: - # only void pixels, the gradients should be 0 - return logits.sum() * 0. - signs = 2. * labels.float() - 1. - errors = (1. - logits * signs) - errors_sorted, perm = torch.sort(errors, dim=0, descending=True) - perm = perm.data - gt_sorted = labels[perm] - grad = lovasz_grad(gt_sorted) - loss = torch.dot(F.relu(errors_sorted), grad) - return loss - - -def lovasz_hinge(logits, - labels, - classes='present', - per_image=False, - class_weight=None, - reduction='mean', - avg_factor=None, - ignore_index=255): - """Binary Lovasz hinge loss. - - Args: - logits (torch.Tensor): [B, H, W], logits at each pixel - (between -infty and +infty). - labels (torch.Tensor): [B, H, W], binary ground truth masks (0 or 1). - classes (str | list[int], optional): Placeholder, to be consistent with - other loss. Default: None. - per_image (bool, optional): If per_image is True, compute the loss per - image instead of per batch. Default: False. - class_weight (list[float], optional): Placeholder, to be consistent - with other loss. Default: None. - reduction (str, optional): The method used to reduce the loss. Options - are "none", "mean" and "sum". This parameter only works when - per_image is True. Default: 'mean'. - avg_factor (int, optional): Average factor that is used to average - the loss. This parameter only works when per_image is True. - Default: None. - ignore_index (int | None): The label index to be ignored. Default: 255. - - Returns: - torch.Tensor: The calculated loss. - """ - if per_image: - loss = [ - lovasz_hinge_flat(*flatten_binary_logits( - logit.unsqueeze(0), label.unsqueeze(0), ignore_index)) - for logit, label in zip(logits, labels) - ] - loss = weight_reduce_loss( - torch.stack(loss), None, reduction, avg_factor) - else: - loss = lovasz_hinge_flat( - *flatten_binary_logits(logits, labels, ignore_index)) - return loss - - -def lovasz_softmax_flat(probs, labels, classes='present', class_weight=None): - """Multi-class Lovasz-Softmax loss. - - Args: - probs (torch.Tensor): [P, C], class probabilities at each prediction - (between 0 and 1). - labels (torch.Tensor): [P], ground truth labels (between 0 and C - 1). 
- classes (str | list[int], optional): Classes chosen to calculate loss. - 'all' for all classes, 'present' for classes present in labels, or - a list of classes to average. Default: 'present'. - class_weight (list[float], optional): The weight for each class. - Default: None. - - Returns: - torch.Tensor: The calculated loss. - """ - if probs.numel() == 0: - # only void pixels, the gradients should be 0 - return probs * 0. - C = probs.size(1) - losses = [] - class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes - for c in class_to_sum: - fg = (labels == c).float() # foreground for class c - if (classes == 'present' and fg.sum() == 0): - continue - if C == 1: - if len(classes) > 1: - raise ValueError('Sigmoid output possible only with 1 class') - class_pred = probs[:, 0] - else: - class_pred = probs[:, c] - errors = (fg - class_pred).abs() - errors_sorted, perm = torch.sort(errors, 0, descending=True) - perm = perm.data - fg_sorted = fg[perm] - loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted)) - if class_weight is not None: - loss *= class_weight[c] - losses.append(loss) - return torch.stack(losses).mean() - - -def lovasz_softmax(probs, - labels, - classes='present', - per_image=False, - class_weight=None, - reduction='mean', - avg_factor=None, - ignore_index=255): - """Multi-class Lovasz-Softmax loss. - - Args: - probs (torch.Tensor): [B, C, H, W], class probabilities at each - prediction (between 0 and 1). - labels (torch.Tensor): [B, H, W], ground truth labels (between 0 and - C - 1). - classes (str | list[int], optional): Classes chosen to calculate loss. - 'all' for all classes, 'present' for classes present in labels, or - a list of classes to average. Default: 'present'. - per_image (bool, optional): If per_image is True, compute the loss per - image instead of per batch. Default: False. - class_weight (list[float], optional): The weight for each class. - Default: None. - reduction (str, optional): The method used to reduce the loss. Options - are "none", "mean" and "sum". This parameter only works when - per_image is True. Default: 'mean'. - avg_factor (int, optional): Average factor that is used to average - the loss. This parameter only works when per_image is True. - Default: None. - ignore_index (int | None): The label index to be ignored. Default: 255. - - Returns: - torch.Tensor: The calculated loss. - """ - - if per_image: - loss = [ - lovasz_softmax_flat( - *flatten_probs( - prob.unsqueeze(0), label.unsqueeze(0), ignore_index), - classes=classes, - class_weight=class_weight) - for prob, label in zip(probs, labels) - ] - loss = weight_reduce_loss( - torch.stack(loss), None, reduction, avg_factor) - else: - loss = lovasz_softmax_flat( - *flatten_probs(probs, labels, ignore_index), - classes=classes, - class_weight=class_weight) - return loss - - -@LOSSES.register_module() -class LovaszLoss(nn.Module): - """LovaszLoss. - - This loss is proposed in `The Lovasz-Softmax loss: A tractable surrogate - for the optimization of the intersection-over-union measure in neural - networks `_. - - Args: - loss_type (str, optional): Binary or multi-class loss. - Default: 'multi_class'. Options are "binary" and "multi_class". - classes (str | list[int], optional): Classes chosen to calculate loss. - 'all' for all classes, 'present' for classes present in labels, or - a list of classes to average. Default: 'present'. - per_image (bool, optional): If per_image is True, compute the loss per - image instead of per batch. Default: False. 
- reduction (str, optional): The method used to reduce the loss. Options - are "none", "mean" and "sum". This parameter only works when - per_image is True. Default: 'mean'. - class_weight (list[float] | str, optional): Weight of each class. If in - str format, read them from a file. Defaults to None. - loss_weight (float, optional): Weight of the loss. Defaults to 1.0. - """ - - def __init__(self, - loss_type='multi_class', - classes='present', - per_image=False, - reduction='mean', - class_weight=None, - loss_weight=1.0): - super(LovaszLoss, self).__init__() - assert loss_type in ('binary', 'multi_class'), "loss_type should be \ - 'binary' or 'multi_class'." - - if loss_type == 'binary': - self.cls_criterion = lovasz_hinge - else: - self.cls_criterion = lovasz_softmax - assert classes in ('all', 'present') or mmcv.is_list_of(classes, int) - if not per_image: - assert reduction == 'none', "reduction should be 'none' when \ - per_image is False." - - self.classes = classes - self.per_image = per_image - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = get_class_weight(class_weight) - - def forward(self, - cls_score, - label, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function.""" - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if self.class_weight is not None: - class_weight = cls_score.new_tensor(self.class_weight) - else: - class_weight = None - - # if multi-class loss, transform logits to probs - if self.cls_criterion == lovasz_softmax: - cls_score = F.softmax(cls_score, dim=1) - - loss_cls = self.loss_weight * self.cls_criterion( - cls_score, - label, - self.classes, - self.per_image, - class_weight=class_weight, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss_cls diff --git a/ControlNet/annotator/uniformer/mmseg/models/losses/utils.py b/ControlNet/annotator/uniformer/mmseg/models/losses/utils.py deleted file mode 100644 index 85aec9f3045240c3de96a928324ae8f5c3aebe8b..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/losses/utils.py +++ /dev/null @@ -1,121 +0,0 @@ -import functools - -import annotator.uniformer.mmcv as mmcv -import numpy as np -import torch.nn.functional as F - - -def get_class_weight(class_weight): - """Get class weight for loss function. - - Args: - class_weight (list[float] | str | None): If class_weight is a str, - take it as a file name and read from it. - """ - if isinstance(class_weight, str): - # take it as a file path - if class_weight.endswith('.npy'): - class_weight = np.load(class_weight) - else: - # pkl, json or yaml - class_weight = mmcv.load(class_weight) - - return class_weight - - -def reduce_loss(loss, reduction): - """Reduce loss as specified. - - Args: - loss (Tensor): Elementwise loss tensor. - reduction (str): Options are "none", "mean" and "sum". - - Return: - Tensor: Reduced loss tensor. - """ - reduction_enum = F._Reduction.get_enum(reduction) - # none: 0, elementwise_mean:1, sum: 2 - if reduction_enum == 0: - return loss - elif reduction_enum == 1: - return loss.mean() - elif reduction_enum == 2: - return loss.sum() - - -def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): - """Apply element-wise weight and reduce loss. - - Args: - loss (Tensor): Element-wise loss. - weight (Tensor): Element-wise weights. - reduction (str): Same as built-in losses of PyTorch. 
- avg_factor (float): Avarage factor when computing the mean of losses. - - Returns: - Tensor: Processed loss values. - """ - # if weight is specified, apply element-wise weight - if weight is not None: - assert weight.dim() == loss.dim() - if weight.dim() > 1: - assert weight.size(1) == 1 or weight.size(1) == loss.size(1) - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - loss = reduce_loss(loss, reduction) - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - loss = loss.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - return loss - - -def weighted_loss(loss_func): - """Create a weighted version of a given loss function. - - To use this decorator, the loss function must have the signature like - `loss_func(pred, target, **kwargs)`. The function only needs to compute - element-wise loss without any reduction. This decorator will add weight - and reduction arguments to the function. The decorated function will have - the signature like `loss_func(pred, target, weight=None, reduction='mean', - avg_factor=None, **kwargs)`. - - :Example: - - >>> import torch - >>> @weighted_loss - >>> def l1_loss(pred, target): - >>> return (pred - target).abs() - - >>> pred = torch.Tensor([0, 2, 3]) - >>> target = torch.Tensor([1, 1, 1]) - >>> weight = torch.Tensor([1, 0, 1]) - - >>> l1_loss(pred, target) - tensor(1.3333) - >>> l1_loss(pred, target, weight) - tensor(1.) - >>> l1_loss(pred, target, reduction='none') - tensor([1., 1., 2.]) - >>> l1_loss(pred, target, weight, avg_factor=2) - tensor(1.5000) - """ - - @functools.wraps(loss_func) - def wrapper(pred, - target, - weight=None, - reduction='mean', - avg_factor=None, - **kwargs): - # get element-wise loss - loss = loss_func(pred, target, **kwargs) - loss = weight_reduce_loss(loss, weight, reduction, avg_factor) - return loss - - return wrapper diff --git a/ControlNet/annotator/uniformer/mmseg/models/necks/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/necks/__init__.py deleted file mode 100644 index 9b9d3d5b3fe80247642d962edd6fb787537d01d6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/necks/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .fpn import FPN -from .multilevel_neck import MultiLevelNeck - -__all__ = ['FPN', 'MultiLevelNeck'] diff --git a/ControlNet/annotator/uniformer/mmseg/models/necks/fpn.py b/ControlNet/annotator/uniformer/mmseg/models/necks/fpn.py deleted file mode 100644 index a53b2a69500f8c2edb835abc3ff0ccc2173d1fb1..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/necks/fpn.py +++ /dev/null @@ -1,212 +0,0 @@ -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule, xavier_init - -from ..builder import NECKS - - -@NECKS.register_module() -class FPN(nn.Module): - """Feature Pyramid Network. - - This is an implementation of - Feature Pyramid Networks for Object - Detection (https://arxiv.org/abs/1612.03144) - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - num_outs (int): Number of output scales. - start_level (int): Index of the start input backbone level used to - build the feature pyramid. Default: 0. 
- end_level (int): Index of the end input backbone level (exclusive) to - build the feature pyramid. Default: -1, which means the last level. - add_extra_convs (bool | str): If bool, it decides whether to add conv - layers on top of the original feature maps. Default to False. - If True, its actual mode is specified by `extra_convs_on_inputs`. - If str, it specifies the source feature map of the extra convs. - Only the following options are allowed - - - 'on_input': Last feat map of neck inputs (i.e. backbone feature). - - 'on_lateral': Last feature map after lateral convs. - - 'on_output': The last output feature map after fpn convs. - extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs - on the original feature from the backbone. If True, - it is equivalent to `add_extra_convs='on_input'`. If False, it is - equivalent to set `add_extra_convs='on_output'`. Default to True. - relu_before_extra_convs (bool): Whether to apply relu before the extra - conv. Default: False. - no_norm_on_lateral (bool): Whether to apply norm on lateral. - Default: False. - conv_cfg (dict): Config dict for convolution layer. Default: None. - norm_cfg (dict): Config dict for normalization layer. Default: None. - act_cfg (str): Config dict for activation layer in ConvModule. - Default: None. - upsample_cfg (dict): Config dict for interpolate layer. - Default: `dict(mode='nearest')` - - Example: - >>> import torch - >>> in_channels = [2, 3, 5, 7] - >>> scales = [340, 170, 84, 43] - >>> inputs = [torch.rand(1, c, s, s) - ... for c, s in zip(in_channels, scales)] - >>> self = FPN(in_channels, 11, len(in_channels)).eval() - >>> outputs = self.forward(inputs) - >>> for i in range(len(outputs)): - ... print(f'outputs[{i}].shape = {outputs[i].shape}') - outputs[0].shape = torch.Size([1, 11, 340, 340]) - outputs[1].shape = torch.Size([1, 11, 170, 170]) - outputs[2].shape = torch.Size([1, 11, 84, 84]) - outputs[3].shape = torch.Size([1, 11, 43, 43]) - """ - - def __init__(self, - in_channels, - out_channels, - num_outs, - start_level=0, - end_level=-1, - add_extra_convs=False, - extra_convs_on_inputs=False, - relu_before_extra_convs=False, - no_norm_on_lateral=False, - conv_cfg=None, - norm_cfg=None, - act_cfg=None, - upsample_cfg=dict(mode='nearest')): - super(FPN, self).__init__() - assert isinstance(in_channels, list) - self.in_channels = in_channels - self.out_channels = out_channels - self.num_ins = len(in_channels) - self.num_outs = num_outs - self.relu_before_extra_convs = relu_before_extra_convs - self.no_norm_on_lateral = no_norm_on_lateral - self.fp16_enabled = False - self.upsample_cfg = upsample_cfg.copy() - - if end_level == -1: - self.backbone_end_level = self.num_ins - assert num_outs >= self.num_ins - start_level - else: - # if end_level < inputs, no extra level is allowed - self.backbone_end_level = end_level - assert end_level <= len(in_channels) - assert num_outs == end_level - start_level - self.start_level = start_level - self.end_level = end_level - self.add_extra_convs = add_extra_convs - assert isinstance(add_extra_convs, (str, bool)) - if isinstance(add_extra_convs, str): - # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' - assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') - elif add_extra_convs: # True - if extra_convs_on_inputs: - # For compatibility with previous release - # TODO: deprecate `extra_convs_on_inputs` - self.add_extra_convs = 'on_input' - else: - self.add_extra_convs = 'on_output' - - self.lateral_convs = nn.ModuleList() - 
self.fpn_convs = nn.ModuleList() - - for i in range(self.start_level, self.backbone_end_level): - l_conv = ConvModule( - in_channels[i], - out_channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, - inplace=False) - fpn_conv = ConvModule( - out_channels, - out_channels, - 3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - inplace=False) - - self.lateral_convs.append(l_conv) - self.fpn_convs.append(fpn_conv) - - # add extra conv layers (e.g., RetinaNet) - extra_levels = num_outs - self.backbone_end_level + self.start_level - if self.add_extra_convs and extra_levels >= 1: - for i in range(extra_levels): - if i == 0 and self.add_extra_convs == 'on_input': - in_channels = self.in_channels[self.backbone_end_level - 1] - else: - in_channels = out_channels - extra_fpn_conv = ConvModule( - in_channels, - out_channels, - 3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - inplace=False) - self.fpn_convs.append(extra_fpn_conv) - - # default init_weights for conv(msra) and norm in ConvModule - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - xavier_init(m, distribution='uniform') - - def forward(self, inputs): - assert len(inputs) == len(self.in_channels) - - # build laterals - laterals = [ - lateral_conv(inputs[i + self.start_level]) - for i, lateral_conv in enumerate(self.lateral_convs) - ] - - # build top-down path - used_backbone_levels = len(laterals) - for i in range(used_backbone_levels - 1, 0, -1): - # In some cases, fixing `scale factor` (e.g. 2) is preferred, but - # it cannot co-exist with `size` in `F.interpolate`. - if 'scale_factor' in self.upsample_cfg: - laterals[i - 1] += F.interpolate(laterals[i], - **self.upsample_cfg) - else: - prev_shape = laterals[i - 1].shape[2:] - laterals[i - 1] += F.interpolate( - laterals[i], size=prev_shape, **self.upsample_cfg) - - # build outputs - # part 1: from original levels - outs = [ - self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) - ] - # part 2: add extra levels - if self.num_outs > len(outs): - # use max pool to get more levels on top of outputs - # (e.g., Faster R-CNN, Mask R-CNN) - if not self.add_extra_convs: - for i in range(self.num_outs - used_backbone_levels): - outs.append(F.max_pool2d(outs[-1], 1, stride=2)) - # add conv layers on top of original feature maps (RetinaNet) - else: - if self.add_extra_convs == 'on_input': - extra_source = inputs[self.backbone_end_level - 1] - elif self.add_extra_convs == 'on_lateral': - extra_source = laterals[-1] - elif self.add_extra_convs == 'on_output': - extra_source = outs[-1] - else: - raise NotImplementedError - outs.append(self.fpn_convs[used_backbone_levels](extra_source)) - for i in range(used_backbone_levels + 1, self.num_outs): - if self.relu_before_extra_convs: - outs.append(self.fpn_convs[i](F.relu(outs[-1]))) - else: - outs.append(self.fpn_convs[i](outs[-1])) - return tuple(outs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/necks/multilevel_neck.py b/ControlNet/annotator/uniformer/mmseg/models/necks/multilevel_neck.py deleted file mode 100644 index 766144d8136326a1fab5906a153a0c0df69b6b60..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/necks/multilevel_neck.py +++ /dev/null @@ -1,70 +0,0 @@ -import torch.nn as nn -import torch.nn.functional as F -from annotator.uniformer.mmcv.cnn import ConvModule - -from ..builder import NECKS - - 
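The FPN implemented above already carries a doctest for the case where ``num_outs`` equals the number of backbone levels; when ``num_outs`` is larger and ``add_extra_convs`` stays ``False``, the extra top levels come from stride-2 max pooling of the last output. A hedged usage sketch with toy channel counts and sizes, assuming the FPN class above is importable:

import torch

in_channels = [2, 3, 5, 7]
sizes = [64, 32, 16, 8]
inputs = [torch.rand(1, c, s, s) for c, s in zip(in_channels, sizes)]

fpn = FPN(in_channels, out_channels=11, num_outs=5).eval()
outs = fpn(inputs)
# Four FPN levels plus one max-pooled extra level on top.
print([o.shape[-1] for o in outs])  # [64, 32, 16, 8, 4]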
-@NECKS.register_module() -class MultiLevelNeck(nn.Module): - """MultiLevelNeck. - - A neck structure connect vit backbone and decoder_heads. - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale). - scales (List[int]): Scale factors for each input feature map. - norm_cfg (dict): Config dict for normalization layer. Default: None. - act_cfg (dict): Config dict for activation layer in ConvModule. - Default: None. - """ - - def __init__(self, - in_channels, - out_channels, - scales=[0.5, 1, 2, 4], - norm_cfg=None, - act_cfg=None): - super(MultiLevelNeck, self).__init__() - assert isinstance(in_channels, list) - self.in_channels = in_channels - self.out_channels = out_channels - self.scales = scales - self.num_outs = len(scales) - self.lateral_convs = nn.ModuleList() - self.convs = nn.ModuleList() - for in_channel in in_channels: - self.lateral_convs.append( - ConvModule( - in_channel, - out_channels, - kernel_size=1, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - for _ in range(self.num_outs): - self.convs.append( - ConvModule( - out_channels, - out_channels, - kernel_size=3, - padding=1, - stride=1, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def forward(self, inputs): - assert len(inputs) == len(self.in_channels) - print(inputs[0].shape) - inputs = [ - lateral_conv(inputs[i]) - for i, lateral_conv in enumerate(self.lateral_convs) - ] - # for len(inputs) not equal to self.num_outs - if len(inputs) == 1: - inputs = [inputs[0] for _ in range(self.num_outs)] - outs = [] - for i in range(self.num_outs): - x_resize = F.interpolate( - inputs[i], scale_factor=self.scales[i], mode='bilinear') - outs.append(self.convs[i](x_resize)) - return tuple(outs) diff --git a/ControlNet/annotator/uniformer/mmseg/models/segmentors/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/segmentors/__init__.py deleted file mode 100644 index dca2f09405330743c476e190896bee39c45498ea..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/segmentors/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseSegmentor -from .cascade_encoder_decoder import CascadeEncoderDecoder -from .encoder_decoder import EncoderDecoder - -__all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] diff --git a/ControlNet/annotator/uniformer/mmseg/models/segmentors/base.py b/ControlNet/annotator/uniformer/mmseg/models/segmentors/base.py deleted file mode 100644 index 172fc63b736c4f13be1cd909433bc260760a1eaa..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/segmentors/base.py +++ /dev/null @@ -1,273 +0,0 @@ -import logging -import warnings -from abc import ABCMeta, abstractmethod -from collections import OrderedDict - -import annotator.uniformer.mmcv as mmcv -import numpy as np -import torch -import torch.distributed as dist -import torch.nn as nn -from annotator.uniformer.mmcv.runner import auto_fp16 - - -class BaseSegmentor(nn.Module): - """Base class for segmentors.""" - - __metaclass__ = ABCMeta - - def __init__(self): - super(BaseSegmentor, self).__init__() - self.fp16_enabled = False - - @property - def with_neck(self): - """bool: whether the segmentor has neck""" - return hasattr(self, 'neck') and self.neck is not None - - @property - def with_auxiliary_head(self): - """bool: whether the segmentor has auxiliary head""" - return hasattr(self, - 'auxiliary_head') and self.auxiliary_head is not None - - @property - def with_decode_head(self): - """bool: 
 whether the segmentor has decode head""" - return hasattr(self, 'decode_head') and self.decode_head is not None - - @abstractmethod - def extract_feat(self, imgs): - """Placeholder for extracting features from images.""" - pass - - @abstractmethod - def encode_decode(self, img, img_metas): - """Placeholder for encoding images with backbone and decoding into a - semantic segmentation map of the same size as input.""" - pass - - @abstractmethod - def forward_train(self, imgs, img_metas, **kwargs): - """Placeholder for the forward function for training.""" - pass - - @abstractmethod - def simple_test(self, img, img_meta, **kwargs): - """Placeholder for single image test.""" - pass - - @abstractmethod - def aug_test(self, imgs, img_metas, **kwargs): - """Placeholder for augmentation test.""" - pass - - def init_weights(self, pretrained=None): - """Initialize the weights in segmentor. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if pretrained is not None: - logger = logging.getLogger() - logger.info(f'load model from: {pretrained}') - - def forward_test(self, imgs, img_metas, **kwargs): - """ - Args: - imgs (List[Tensor]): the outer list indicates test-time - augmentations and inner Tensor should have a shape NxCxHxW, - which contains all images in the batch. - img_metas (List[List[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch. - """ - for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError(f'{name} must be a list, but got ' - f'{type(var)}') - - num_augs = len(imgs) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(imgs)}) != ' - f'num of image meta ({len(img_metas)})') - # all images in the same aug batch should have the same ori_shape and - # pad shape - for img_meta in img_metas: - ori_shapes = [_['ori_shape'] for _ in img_meta] - assert all(shape == ori_shapes[0] for shape in ori_shapes) - img_shapes = [_['img_shape'] for _ in img_meta] - assert all(shape == img_shapes[0] for shape in img_shapes) - pad_shapes = [_['pad_shape'] for _ in img_meta] - assert all(shape == pad_shapes[0] for shape in pad_shapes) - - if num_augs == 1: - return self.simple_test(imgs[0], img_metas[0], **kwargs) - else: - return self.aug_test(imgs, img_metas, **kwargs) - - @auto_fp16(apply_to=('img', )) - def forward(self, img, img_metas, return_loss=True, **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. - - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``return_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train(img, img_metas, **kwargs) - else: - return self.forward_test(img, img_metas, **kwargs) - - def train_step(self, data_batch, optimizer, **kwargs): - """The iteration step during training. - - This method defines an iteration step during training, except for the - back propagation and optimizer updating, which are done in an optimizer - hook. Note that in some complicated cases or models, the whole process - including back propagation and optimizer updating is also defined in - this method, such as GAN. - - Args: - data_batch (dict): The output of dataloader.
- optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of - runner is passed to ``train_step()``. This argument is unused - and reserved. - - Returns: - dict: It should contain at least 3 keys: ``loss``, ``log_vars``, - ``num_samples``. - ``loss`` is a tensor for back propagation, which can be a - weighted sum of multiple losses. - ``log_vars`` contains all the variables to be sent to the - logger. - ``num_samples`` indicates the batch size (when the model is - DDP, it means the batch size on each GPU), which is used for - averaging the logs. - """ - losses = self(**data_batch) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, - log_vars=log_vars, - num_samples=len(data_batch['img_metas'])) - - return outputs - - def val_step(self, data_batch, **kwargs): - """The iteration step during validation. - - This method shares the same signature as :func:`train_step`, but used - during val epochs. Note that the evaluation after training epochs is - not implemented with this method, but an evaluation hook. - """ - output = self(**data_batch, **kwargs) - return output - - @staticmethod - def _parse_losses(losses): - """Parse the raw outputs (losses) of the network. - - Args: - losses (dict): Raw output of the network, which usually contain - losses and other necessary information. - - Returns: - tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor - which may be a weighted sum of all losses, log_vars contains - all the variables to be sent to the logger. - """ - log_vars = OrderedDict() - for loss_name, loss_value in losses.items(): - if isinstance(loss_value, torch.Tensor): - log_vars[loss_name] = loss_value.mean() - elif isinstance(loss_value, list): - log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) - else: - raise TypeError( - f'{loss_name} is not a tensor or list of tensors') - - loss = sum(_value for _key, _value in log_vars.items() - if 'loss' in _key) - - log_vars['loss'] = loss - for loss_name, loss_value in log_vars.items(): - # reduce loss when distributed training - if dist.is_available() and dist.is_initialized(): - loss_value = loss_value.data.clone() - dist.all_reduce(loss_value.div_(dist.get_world_size())) - log_vars[loss_name] = loss_value.item() - - return loss, log_vars - - def show_result(self, - img, - result, - palette=None, - win_name='', - show=False, - wait_time=0, - out_file=None, - opacity=0.5): - """Draw `result` over `img`. - - Args: - img (str or Tensor): The image to be displayed. - result (Tensor): The semantic segmentation results to draw over - `img`. - palette (list[list[int]]] | np.ndarray | None): The palette of - segmentation map. If None is given, random palette will be - generated. Default: None - win_name (str): The window name. - wait_time (int): Value of waitKey param. - Default: 0. - show (bool): Whether to show the image. - Default: False. - out_file (str or None): The filename to write the image. - Default: None. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. 
- Returns: - img (Tensor): Only if not `show` or `out_file` - """ - img = mmcv.imread(img) - img = img.copy() - seg = result[0] - if palette is None: - if self.PALETTE is None: - palette = np.random.randint( - 0, 255, size=(len(self.CLASSES), 3)) - else: - palette = self.PALETTE - palette = np.array(palette) - assert palette.shape[0] == len(self.CLASSES) - assert palette.shape[1] == 3 - assert len(palette.shape) == 2 - assert 0 < opacity <= 1.0 - color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) - for label, color in enumerate(palette): - color_seg[seg == label, :] = color - # convert to BGR - color_seg = color_seg[..., ::-1] - - img = img * (1 - opacity) + color_seg * opacity - img = img.astype(np.uint8) - # if out_file specified, do not show image in window - if out_file is not None: - show = False - - if show: - mmcv.imshow(img, win_name, wait_time) - if out_file is not None: - mmcv.imwrite(img, out_file) - - if not (show or out_file): - warnings.warn('show==False and out_file is not specified, only ' - 'result image will be returned') - return img diff --git a/ControlNet/annotator/uniformer/mmseg/models/segmentors/cascade_encoder_decoder.py b/ControlNet/annotator/uniformer/mmseg/models/segmentors/cascade_encoder_decoder.py deleted file mode 100644 index 873957d8d6468147c994493d92ff5c1b15bfb703..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/segmentors/cascade_encoder_decoder.py +++ /dev/null @@ -1,98 +0,0 @@ -from torch import nn - -from annotator.uniformer.mmseg.core import add_prefix -from annotator.uniformer.mmseg.ops import resize -from .. import builder -from ..builder import SEGMENTORS -from .encoder_decoder import EncoderDecoder - - -@SEGMENTORS.register_module() -class CascadeEncoderDecoder(EncoderDecoder): - """Cascade Encoder Decoder segmentors. - - CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of - CascadeEncoderDecoder are cascaded. The output of previous decoder_head - will be the input of next decoder_head. - """ - - def __init__(self, - num_stages, - backbone, - decode_head, - neck=None, - auxiliary_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None): - self.num_stages = num_stages - super(CascadeEncoderDecoder, self).__init__( - backbone=backbone, - decode_head=decode_head, - neck=neck, - auxiliary_head=auxiliary_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained) - - def _init_decode_head(self, decode_head): - """Initialize ``decode_head``""" - assert isinstance(decode_head, list) - assert len(decode_head) == self.num_stages - self.decode_head = nn.ModuleList() - for i in range(self.num_stages): - self.decode_head.append(builder.build_head(decode_head[i])) - self.align_corners = self.decode_head[-1].align_corners - self.num_classes = self.decode_head[-1].num_classes - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone and heads. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - self.backbone.init_weights(pretrained=pretrained) - for i in range(self.num_stages): - self.decode_head[i].init_weights() - if self.with_auxiliary_head: - if isinstance(self.auxiliary_head, nn.ModuleList): - for aux_head in self.auxiliary_head: - aux_head.init_weights() - else: - self.auxiliary_head.init_weights() - - def encode_decode(self, img, img_metas): - """Encode images with backbone and decode into a semantic segmentation - map of the same size as input.""" - x = self.extract_feat(img) - out = self.decode_head[0].forward_test(x, img_metas, self.test_cfg) - for i in range(1, self.num_stages): - out = self.decode_head[i].forward_test(x, out, img_metas, - self.test_cfg) - out = resize( - input=out, - size=img.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - return out - - def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): - """Run forward function and calculate loss for decode head in - training.""" - losses = dict() - - loss_decode = self.decode_head[0].forward_train( - x, img_metas, gt_semantic_seg, self.train_cfg) - - losses.update(add_prefix(loss_decode, 'decode_0')) - - for i in range(1, self.num_stages): - # forward test again, maybe unnecessary for most methods. - prev_outputs = self.decode_head[i - 1].forward_test( - x, img_metas, self.test_cfg) - loss_decode = self.decode_head[i].forward_train( - x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg) - losses.update(add_prefix(loss_decode, f'decode_{i}')) - - return losses diff --git a/ControlNet/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py b/ControlNet/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py deleted file mode 100644 index 98392ac04c4c44a7f4e7b1c0808266875877dd1f..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/segmentors/encoder_decoder.py +++ /dev/null @@ -1,298 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from annotator.uniformer.mmseg.core import add_prefix -from annotator.uniformer.mmseg.ops import resize -from .. import builder -from ..builder import SEGMENTORS -from .base import BaseSegmentor - - -@SEGMENTORS.register_module() -class EncoderDecoder(BaseSegmentor): - """Encoder Decoder segmentors. - - EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. - Note that auxiliary_head is only used for deep supervision during training, - which could be dumped during inference. 
- """ - - def __init__(self, - backbone, - decode_head, - neck=None, - auxiliary_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(EncoderDecoder, self).__init__() - self.backbone = builder.build_backbone(backbone) - if neck is not None: - self.neck = builder.build_neck(neck) - self._init_decode_head(decode_head) - self._init_auxiliary_head(auxiliary_head) - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - self.init_weights(pretrained=pretrained) - - assert self.with_decode_head - - def _init_decode_head(self, decode_head): - """Initialize ``decode_head``""" - self.decode_head = builder.build_head(decode_head) - self.align_corners = self.decode_head.align_corners - self.num_classes = self.decode_head.num_classes - - def _init_auxiliary_head(self, auxiliary_head): - """Initialize ``auxiliary_head``""" - if auxiliary_head is not None: - if isinstance(auxiliary_head, list): - self.auxiliary_head = nn.ModuleList() - for head_cfg in auxiliary_head: - self.auxiliary_head.append(builder.build_head(head_cfg)) - else: - self.auxiliary_head = builder.build_head(auxiliary_head) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone and heads. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - - super(EncoderDecoder, self).init_weights(pretrained) - self.backbone.init_weights(pretrained=pretrained) - self.decode_head.init_weights() - if self.with_auxiliary_head: - if isinstance(self.auxiliary_head, nn.ModuleList): - for aux_head in self.auxiliary_head: - aux_head.init_weights() - else: - self.auxiliary_head.init_weights() - - def extract_feat(self, img): - """Extract features from images.""" - x = self.backbone(img) - if self.with_neck: - x = self.neck(x) - return x - - def encode_decode(self, img, img_metas): - """Encode images with backbone and decode into a semantic segmentation - map of the same size as input.""" - x = self.extract_feat(img) - out = self._decode_head_forward_test(x, img_metas) - out = resize( - input=out, - size=img.shape[2:], - mode='bilinear', - align_corners=self.align_corners) - return out - - def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): - """Run forward function and calculate loss for decode head in - training.""" - losses = dict() - loss_decode = self.decode_head.forward_train(x, img_metas, - gt_semantic_seg, - self.train_cfg) - - losses.update(add_prefix(loss_decode, 'decode')) - return losses - - def _decode_head_forward_test(self, x, img_metas): - """Run forward function and calculate loss for decode head in - inference.""" - seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) - return seg_logits - - def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): - """Run forward function and calculate loss for auxiliary head in - training.""" - losses = dict() - if isinstance(self.auxiliary_head, nn.ModuleList): - for idx, aux_head in enumerate(self.auxiliary_head): - loss_aux = aux_head.forward_train(x, img_metas, - gt_semantic_seg, - self.train_cfg) - losses.update(add_prefix(loss_aux, f'aux_{idx}')) - else: - loss_aux = self.auxiliary_head.forward_train( - x, img_metas, gt_semantic_seg, self.train_cfg) - losses.update(add_prefix(loss_aux, 'aux')) - - return losses - - def forward_dummy(self, img): - """Dummy forward function.""" - seg_logit = self.encode_decode(img, None) - - return seg_logit - - def forward_train(self, img, img_metas, gt_semantic_seg): - """Forward function for training. 
- - Args: - img (Tensor): Input images. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - gt_semantic_seg (Tensor): Semantic segmentation masks - used if the architecture supports semantic segmentation task. - - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - - x = self.extract_feat(img) - - losses = dict() - - loss_decode = self._decode_head_forward_train(x, img_metas, - gt_semantic_seg) - losses.update(loss_decode) - - if self.with_auxiliary_head: - loss_aux = self._auxiliary_head_forward_train( - x, img_metas, gt_semantic_seg) - losses.update(loss_aux) - - return losses - - # TODO refactor - def slide_inference(self, img, img_meta, rescale): - """Inference by sliding-window with overlap. - - If h_crop > h_img or w_crop > w_img, the small patch will be used to - decode without padding. - """ - - h_stride, w_stride = self.test_cfg.stride - h_crop, w_crop = self.test_cfg.crop_size - batch_size, _, h_img, w_img = img.size() - num_classes = self.num_classes - h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 - w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 - preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) - count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) - for h_idx in range(h_grids): - for w_idx in range(w_grids): - y1 = h_idx * h_stride - x1 = w_idx * w_stride - y2 = min(y1 + h_crop, h_img) - x2 = min(x1 + w_crop, w_img) - y1 = max(y2 - h_crop, 0) - x1 = max(x2 - w_crop, 0) - crop_img = img[:, :, y1:y2, x1:x2] - crop_seg_logit = self.encode_decode(crop_img, img_meta) - preds += F.pad(crop_seg_logit, - (int(x1), int(preds.shape[3] - x2), int(y1), - int(preds.shape[2] - y2))) - - count_mat[:, :, y1:y2, x1:x2] += 1 - assert (count_mat == 0).sum() == 0 - if torch.onnx.is_in_onnx_export(): - # cast count_mat to constant while exporting to ONNX - count_mat = torch.from_numpy( - count_mat.cpu().detach().numpy()).to(device=img.device) - preds = preds / count_mat - if rescale: - preds = resize( - preds, - size=img_meta[0]['ori_shape'][:2], - mode='bilinear', - align_corners=self.align_corners, - warning=False) - return preds - - def whole_inference(self, img, img_meta, rescale): - """Inference with full image.""" - - seg_logit = self.encode_decode(img, img_meta) - if rescale: - # support dynamic shape for onnx - if torch.onnx.is_in_onnx_export(): - size = img.shape[2:] - else: - size = img_meta[0]['ori_shape'][:2] - seg_logit = resize( - seg_logit, - size=size, - mode='bilinear', - align_corners=self.align_corners, - warning=False) - - return seg_logit - - def inference(self, img, img_meta, rescale): - """Inference with slide/whole style. - - Args: - img (Tensor): The input image of shape (N, 3, H, W). - img_meta (dict): Image info dict where each dict has: 'img_shape', - 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmseg/datasets/pipelines/formatting.py:Collect`. - rescale (bool): Whether rescale back to original shape. - - Returns: - Tensor: The output segmentation map. 
- """ - - assert self.test_cfg.mode in ['slide', 'whole'] - ori_shape = img_meta[0]['ori_shape'] - assert all(_['ori_shape'] == ori_shape for _ in img_meta) - if self.test_cfg.mode == 'slide': - seg_logit = self.slide_inference(img, img_meta, rescale) - else: - seg_logit = self.whole_inference(img, img_meta, rescale) - output = F.softmax(seg_logit, dim=1) - flip = img_meta[0]['flip'] - if flip: - flip_direction = img_meta[0]['flip_direction'] - assert flip_direction in ['horizontal', 'vertical'] - if flip_direction == 'horizontal': - output = output.flip(dims=(3, )) - elif flip_direction == 'vertical': - output = output.flip(dims=(2, )) - - return output - - def simple_test(self, img, img_meta, rescale=True): - """Simple test with single image.""" - seg_logit = self.inference(img, img_meta, rescale) - seg_pred = seg_logit.argmax(dim=1) - if torch.onnx.is_in_onnx_export(): - # our inference backend only support 4D output - seg_pred = seg_pred.unsqueeze(0) - return seg_pred - seg_pred = seg_pred.cpu().numpy() - # unravel batch dim - seg_pred = list(seg_pred) - return seg_pred - - def aug_test(self, imgs, img_metas, rescale=True): - """Test with augmentations. - - Only rescale=True is supported. - """ - # aug_test rescale all imgs back to ori_shape for now - assert rescale - # to save memory, we get augmented seg logit inplace - seg_logit = self.inference(imgs[0], img_metas[0], rescale) - for i in range(1, len(imgs)): - cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) - seg_logit += cur_seg_logit - seg_logit /= len(imgs) - seg_pred = seg_logit.argmax(dim=1) - seg_pred = seg_pred.cpu().numpy() - # unravel batch dim - seg_pred = list(seg_pred) - return seg_pred diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/__init__.py b/ControlNet/annotator/uniformer/mmseg/models/utils/__init__.py deleted file mode 100644 index 3d3bdd349b9f2ae499a2fcb2ac1d2e3c77befebe..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .drop import DropPath -from .inverted_residual import InvertedResidual, InvertedResidualV3 -from .make_divisible import make_divisible -from .res_layer import ResLayer -from .se_layer import SELayer -from .self_attention_block import SelfAttentionBlock -from .up_conv_block import UpConvBlock -from .weight_init import trunc_normal_ - -__all__ = [ - 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', - 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' -] diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/drop.py b/ControlNet/annotator/uniformer/mmseg/models/utils/drop.py deleted file mode 100644 index 4520b0ff407d2a95a864086bdbca0065f222aa63..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/drop.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Modified from https://github.com/rwightman/pytorch-image- -models/blob/master/timm/models/layers/drop.py.""" - -import torch -from torch import nn - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of - residual blocks). - - Args: - drop_prob (float): Drop rate for paths of model. Dropout rate has - to be between 0 and 1. Default: 0. - """ - - def __init__(self, drop_prob=0.): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - self.keep_prob = 1 - drop_prob - - def forward(self, x): - if self.drop_prob == 0. 
or not self.training: - return x - shape = (x.shape[0], ) + (1, ) * ( - x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = self.keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(self.keep_prob) * random_tensor - return output diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/inverted_residual.py b/ControlNet/annotator/uniformer/mmseg/models/utils/inverted_residual.py deleted file mode 100644 index 53b8fcd41f71d814738f1ac3f5acd3c3d701bf96..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/inverted_residual.py +++ /dev/null @@ -1,208 +0,0 @@ -from annotator.uniformer.mmcv.cnn import ConvModule -from torch import nn -from torch.utils import checkpoint as cp - -from .se_layer import SELayer - - -class InvertedResidual(nn.Module): - """InvertedResidual block for MobileNetV2. - - Args: - in_channels (int): The input channels of the InvertedResidual block. - out_channels (int): The output channels of the InvertedResidual block. - stride (int): Stride of the middle (first) 3x3 convolution. - expand_ratio (int): Adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - dilation (int): Dilation rate of depthwise conv. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, - in_channels, - out_channels, - stride, - expand_ratio, - dilation=1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - with_cp=False): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2], f'stride must in [1, 2]. ' \ - f'But received {stride}.' - self.with_cp = with_cp - self.use_res_connect = self.stride == 1 and in_channels == out_channels - hidden_dim = int(round(in_channels * expand_ratio)) - - layers = [] - if expand_ratio != 1: - layers.append( - ConvModule( - in_channels=in_channels, - out_channels=hidden_dim, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - layers.extend([ - ConvModule( - in_channels=hidden_dim, - out_channels=hidden_dim, - kernel_size=3, - stride=stride, - padding=dilation, - dilation=dilation, - groups=hidden_dim, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - in_channels=hidden_dim, - out_channels=out_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - ]) - self.conv = nn.Sequential(*layers) - - def forward(self, x): - - def _inner_forward(x): - if self.use_res_connect: - return x + self.conv(x) - else: - return self.conv(x) - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class InvertedResidualV3(nn.Module): - """Inverted Residual Block for MobileNetV3. - - Args: - in_channels (int): The input channels of this Module. - out_channels (int): The output channels of this Module. - mid_channels (int): The input channels of the depthwise convolution. - kernel_size (int): The kernel size of the depthwise convolution. - Default: 3. 
- stride (int): The stride of the depthwise convolution. Default: 1. - se_cfg (dict): Config dict for se layer. Default: None, which means no - se layer. - with_expand_conv (bool): Use expand conv or not. If set False, - mid_channels must be the same with in_channels. Default: True. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, - in_channels, - out_channels, - mid_channels, - kernel_size=3, - stride=1, - se_cfg=None, - with_expand_conv=True, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - super(InvertedResidualV3, self).__init__() - self.with_res_shortcut = (stride == 1 and in_channels == out_channels) - assert stride in [1, 2] - self.with_cp = with_cp - self.with_se = se_cfg is not None - self.with_expand_conv = with_expand_conv - - if self.with_se: - assert isinstance(se_cfg, dict) - if not self.with_expand_conv: - assert mid_channels == in_channels - - if self.with_expand_conv: - self.expand_conv = ConvModule( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.depthwise_conv = ConvModule( - in_channels=mid_channels, - out_channels=mid_channels, - kernel_size=kernel_size, - stride=stride, - padding=kernel_size // 2, - groups=mid_channels, - conv_cfg=dict( - type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - if self.with_se: - self.se = SELayer(**se_cfg) - - self.linear_conv = ConvModule( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - def forward(self, x): - - def _inner_forward(x): - out = x - - if self.with_expand_conv: - out = self.expand_conv(out) - - out = self.depthwise_conv(out) - - if self.with_se: - out = self.se(out) - - out = self.linear_conv(out) - - if self.with_res_shortcut: - return x + out - else: - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/make_divisible.py b/ControlNet/annotator/uniformer/mmseg/models/utils/make_divisible.py deleted file mode 100644 index 75ad756052529f52fe83bb95dd1f0ecfc9a13078..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/make_divisible.py +++ /dev/null @@ -1,27 +0,0 @@ -def make_divisible(value, divisor, min_value=None, min_ratio=0.9): - """Make divisible function. - - This function rounds the channel number to the nearest value that can be - divisible by the divisor. It is taken from the original tf repo. It ensures - that all layers have a channel number that is divisible by divisor. It can - be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa - - Args: - value (int): The original channel number. - divisor (int): The divisor to fully divide the channel number. - min_value (int): The minimum value of the output channel. 
- Default: None, means that the minimum value equal to the divisor. - min_ratio (float): The minimum ratio of the rounded channel number to - the original channel number. Default: 0.9. - - Returns: - int: The modified output channel number. - """ - - if min_value is None: - min_value = divisor - new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than (1-min_ratio). - if new_value < min_ratio * value: - new_value += divisor - return new_value diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/res_layer.py b/ControlNet/annotator/uniformer/mmseg/models/utils/res_layer.py deleted file mode 100644 index b2c07b47007e92e4c3945b989e79f9d50306f5fe..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/res_layer.py +++ /dev/null @@ -1,94 +0,0 @@ -from annotator.uniformer.mmcv.cnn import build_conv_layer, build_norm_layer -from torch import nn as nn - - -class ResLayer(nn.Sequential): - """ResLayer to build ResNet style backbone. - - Args: - block (nn.Module): block used to build ResLayer. - inplanes (int): inplanes of block. - planes (int): planes of block. - num_blocks (int): number of blocks. - stride (int): stride of the first block. Default: 1 - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - multi_grid (int | None): Multi grid dilation rates of last - stage. Default: None - contract_dilation (bool): Whether contract first dilation of each layer - Default: False - """ - - def __init__(self, - block, - inplanes, - planes, - num_blocks, - stride=1, - dilation=1, - avg_down=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - multi_grid=None, - contract_dilation=False, - **kwargs): - self.block = block - - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = [] - conv_stride = stride - if avg_down: - conv_stride = 1 - downsample.append( - nn.AvgPool2d( - kernel_size=stride, - stride=stride, - ceil_mode=True, - count_include_pad=False)) - downsample.extend([ - build_conv_layer( - conv_cfg, - inplanes, - planes * block.expansion, - kernel_size=1, - stride=conv_stride, - bias=False), - build_norm_layer(norm_cfg, planes * block.expansion)[1] - ]) - downsample = nn.Sequential(*downsample) - - layers = [] - if multi_grid is None: - if dilation > 1 and contract_dilation: - first_dilation = dilation // 2 - else: - first_dilation = dilation - else: - first_dilation = multi_grid[0] - layers.append( - block( - inplanes=inplanes, - planes=planes, - stride=stride, - dilation=first_dilation, - downsample=downsample, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - **kwargs)) - inplanes = planes * block.expansion - for i in range(1, num_blocks): - layers.append( - block( - inplanes=inplanes, - planes=planes, - stride=1, - dilation=dilation if multi_grid is None else multi_grid[i], - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - **kwargs)) - super(ResLayer, self).__init__(*layers) diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/se_layer.py b/ControlNet/annotator/uniformer/mmseg/models/utils/se_layer.py deleted file mode 100644 index 083bd7d1ccee909c900c7aed2cc928bf14727f3e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/se_layer.py +++ /dev/null @@ -1,57 +0,0 
@@ -import annotator.uniformer.mmcv as mmcv -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule - -from .make_divisible import make_divisible - - -class SELayer(nn.Module): - """Squeeze-and-Excitation Module. - - Args: - channels (int): The input (and output) channels of the SE layer. - ratio (int): Squeeze ratio in SELayer, the intermediate channel will be - ``int(channels/ratio)``. Default: 16. - conv_cfg (None or dict): Config dict for convolution layer. - Default: None, which means using conv2d. - act_cfg (dict or Sequence[dict]): Config dict for activation layer. - If act_cfg is a dict, two activation layers will be configured - by this dict. If act_cfg is a sequence of dicts, the first - activation layer will be configured by the first dict and the - second activation layer will be configured by the second dict. - Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, - divisor=6.0)). - """ - - def __init__(self, - channels, - ratio=16, - conv_cfg=None, - act_cfg=(dict(type='ReLU'), - dict(type='HSigmoid', bias=3.0, divisor=6.0))): - super(SELayer, self).__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.conv1 = ConvModule( - in_channels=channels, - out_channels=make_divisible(channels // ratio, 8), - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - act_cfg=act_cfg[0]) - self.conv2 = ConvModule( - in_channels=make_divisible(channels // ratio, 8), - out_channels=channels, - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - act_cfg=act_cfg[1]) - - def forward(self, x): - out = self.global_avgpool(x) - out = self.conv1(out) - out = self.conv2(out) - return x * out diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/self_attention_block.py b/ControlNet/annotator/uniformer/mmseg/models/utils/self_attention_block.py deleted file mode 100644 index 440c7b73ee4706fde555595926d63a18d7574acc..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/self_attention_block.py +++ /dev/null @@ -1,159 +0,0 @@ -import torch -from annotator.uniformer.mmcv.cnn import ConvModule, constant_init -from torch import nn as nn -from torch.nn import functional as F - - -class SelfAttentionBlock(nn.Module): - """General self-attention block/non-local block. - - Please refer to https://arxiv.org/abs/1706.03762 for details about key, - query and value. - - Args: - key_in_channels (int): Input channels of key feature. - query_in_channels (int): Input channels of query feature. - channels (int): Output channels of key/query transform. - out_channels (int): Output channels. - share_key_query (bool): Whether share projection weight between key - and query projection. - query_downsample (nn.Module): Query downsample module. - key_downsample (nn.Module): Key downsample module. - key_query_num_convs (int): Number of convs for key/query projection. - value_num_convs (int): Number of convs for value projection. - matmul_norm (bool): Whether normalize attention map with sqrt of - channels - with_out (bool): Whether use out projection. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. - act_cfg (dict|None): Config of activation layers. 
- """ - - def __init__(self, key_in_channels, query_in_channels, channels, - out_channels, share_key_query, query_downsample, - key_downsample, key_query_num_convs, value_out_num_convs, - key_query_norm, value_out_norm, matmul_norm, with_out, - conv_cfg, norm_cfg, act_cfg): - super(SelfAttentionBlock, self).__init__() - if share_key_query: - assert key_in_channels == query_in_channels - self.key_in_channels = key_in_channels - self.query_in_channels = query_in_channels - self.out_channels = out_channels - self.channels = channels - self.share_key_query = share_key_query - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.key_project = self.build_project( - key_in_channels, - channels, - num_convs=key_query_num_convs, - use_conv_module=key_query_norm, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - if share_key_query: - self.query_project = self.key_project - else: - self.query_project = self.build_project( - query_in_channels, - channels, - num_convs=key_query_num_convs, - use_conv_module=key_query_norm, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.value_project = self.build_project( - key_in_channels, - channels if with_out else out_channels, - num_convs=value_out_num_convs, - use_conv_module=value_out_norm, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - if with_out: - self.out_project = self.build_project( - channels, - out_channels, - num_convs=value_out_num_convs, - use_conv_module=value_out_norm, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - else: - self.out_project = None - - self.query_downsample = query_downsample - self.key_downsample = key_downsample - self.matmul_norm = matmul_norm - - self.init_weights() - - def init_weights(self): - """Initialize weight of later layer.""" - if self.out_project is not None: - if not isinstance(self.out_project, ConvModule): - constant_init(self.out_project, 0) - - def build_project(self, in_channels, channels, num_convs, use_conv_module, - conv_cfg, norm_cfg, act_cfg): - """Build projection layer for key/query/value/out.""" - if use_conv_module: - convs = [ - ConvModule( - in_channels, - channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - ] - for _ in range(num_convs - 1): - convs.append( - ConvModule( - channels, - channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - else: - convs = [nn.Conv2d(in_channels, channels, 1)] - for _ in range(num_convs - 1): - convs.append(nn.Conv2d(channels, channels, 1)) - if len(convs) > 1: - convs = nn.Sequential(*convs) - else: - convs = convs[0] - return convs - - def forward(self, query_feats, key_feats): - """Forward function.""" - batch_size = query_feats.size(0) - query = self.query_project(query_feats) - if self.query_downsample is not None: - query = self.query_downsample(query) - query = query.reshape(*query.shape[:2], -1) - query = query.permute(0, 2, 1).contiguous() - - key = self.key_project(key_feats) - value = self.value_project(key_feats) - if self.key_downsample is not None: - key = self.key_downsample(key) - value = self.key_downsample(value) - key = key.reshape(*key.shape[:2], -1) - value = value.reshape(*value.shape[:2], -1) - value = value.permute(0, 2, 1).contiguous() - - sim_map = torch.matmul(query, key) - if self.matmul_norm: - sim_map = (self.channels**-.5) * sim_map - sim_map = F.softmax(sim_map, dim=-1) - - context = torch.matmul(sim_map, value) - context = context.permute(0, 2, 1).contiguous() - context = 
context.reshape(batch_size, -1, *query_feats.shape[2:]) - if self.out_project is not None: - context = self.out_project(context) - return context diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/up_conv_block.py b/ControlNet/annotator/uniformer/mmseg/models/utils/up_conv_block.py deleted file mode 100644 index 378469da76cb7bff6a639e7877b3c275d50490fb..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/up_conv_block.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -import torch.nn as nn -from annotator.uniformer.mmcv.cnn import ConvModule, build_upsample_layer - - -class UpConvBlock(nn.Module): - """Upsample convolution block in decoder for UNet. - - This upsample convolution block consists of one upsample module - followed by one convolution block. The upsample module expands the - high-level low-resolution feature map and the convolution block fuses - the upsampled high-level low-resolution feature map and the low-level - high-resolution feature map from encoder. - - Args: - conv_block (nn.Sequential): Sequential of convolutional layers. - in_channels (int): Number of input channels of the high-level - skip_channels (int): Number of input channels of the low-level - high-resolution feature map from encoder. - out_channels (int): Number of output channels. - num_convs (int): Number of convolutional layers in the conv_block. - Default: 2. - stride (int): Stride of convolutional layer in conv_block. Default: 1. - dilation (int): Dilation rate of convolutional layer in conv_block. - Default: 1. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - conv_cfg (dict | None): Config dict for convolution layer. - Default: None. - norm_cfg (dict | None): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict | None): Config dict for activation layer in ConvModule. - Default: dict(type='ReLU'). - upsample_cfg (dict): The upsample config of the upsample module in - decoder. Default: dict(type='InterpConv'). If the size of - high-level feature map is the same as that of skip feature map - (low-level feature map from encoder), it does not need upsample the - high-level feature map and the upsample_cfg is None. - dcn (bool): Use deformable convolution in convolutional layer or not. - Default: None. - plugins (dict): plugins for convolutional layers. Default: None. - """ - - def __init__(self, - conv_block, - in_channels, - skip_channels, - out_channels, - num_convs=2, - stride=1, - dilation=1, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - upsample_cfg=dict(type='InterpConv'), - dcn=None, - plugins=None): - super(UpConvBlock, self).__init__() - assert dcn is None, 'Not implemented yet.' - assert plugins is None, 'Not implemented yet.' 
- - self.conv_block = conv_block( - in_channels=2 * skip_channels, - out_channels=out_channels, - num_convs=num_convs, - stride=stride, - dilation=dilation, - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dcn=None, - plugins=None) - if upsample_cfg is not None: - self.upsample = build_upsample_layer( - cfg=upsample_cfg, - in_channels=in_channels, - out_channels=skip_channels, - with_cp=with_cp, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - else: - self.upsample = ConvModule( - in_channels, - skip_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - def forward(self, skip, x): - """Forward function.""" - - x = self.upsample(x) - out = torch.cat([skip, x], dim=1) - out = self.conv_block(out) - - return out diff --git a/ControlNet/annotator/uniformer/mmseg/models/utils/weight_init.py b/ControlNet/annotator/uniformer/mmseg/models/utils/weight_init.py deleted file mode 100644 index 38141ba3d61f64ddfc0a31574b4648cbad96d7dd..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/models/utils/weight_init.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Modified from https://github.com/rwightman/pytorch-image- -models/blob/master/timm/models/layers/drop.py.""" - -import math -import warnings - -import torch - - -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - """Reference: https://people.sc.fsu.edu/~jburkardt/presentations - /truncated_normal.pdf""" - - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' - 'The distribution of values may be incorrect.', - stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - lower_bound = norm_cdf((a - mean) / std) - upper_bound = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * lower_bound - 1, 2 * upper_bound - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. 
- Args: - tensor (``torch.Tensor``): an n-dimensional `torch.Tensor` - mean (float): the mean of the normal distribution - std (float): the standard deviation of the normal distribution - a (float): the minimum cutoff value - b (float): the maximum cutoff value - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/ControlNet/annotator/uniformer/mmseg/ops/__init__.py b/ControlNet/annotator/uniformer/mmseg/ops/__init__.py deleted file mode 100644 index bec51c75b9363a9a19e9fb5c35f4e7dbd6f7751c..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/ops/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .encoding import Encoding -from .wrappers import Upsample, resize - -__all__ = ['Upsample', 'resize', 'Encoding'] diff --git a/ControlNet/annotator/uniformer/mmseg/ops/encoding.py b/ControlNet/annotator/uniformer/mmseg/ops/encoding.py deleted file mode 100644 index 7eb3629a6426550b8e4c537ee1ff4341893e489e..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/ops/encoding.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -from torch import nn -from torch.nn import functional as F - - -class Encoding(nn.Module): - """Encoding Layer: a learnable residual encoder. - - Input is of shape (batch_size, channels, height, width). - Output is of shape (batch_size, num_codes, channels). - - Args: - channels: dimension of the features or feature channels - num_codes: number of code words - """ - - def __init__(self, channels, num_codes): - super(Encoding, self).__init__() - # init codewords and smoothing factor - self.channels, self.num_codes = channels, num_codes - std = 1. / ((num_codes * channels)**0.5) - # [num_codes, channels] - self.codewords = nn.Parameter( - torch.empty(num_codes, channels, - dtype=torch.float).uniform_(-std, std), - requires_grad=True) - # [num_codes] - self.scale = nn.Parameter( - torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), - requires_grad=True) - - @staticmethod - def scaled_l2(x, codewords, scale): - num_codes, channels = codewords.size() - batch_size = x.size(0) - reshaped_scale = scale.view((1, 1, num_codes)) - expanded_x = x.unsqueeze(2).expand( - (batch_size, x.size(1), num_codes, channels)) - reshaped_codewords = codewords.view((1, 1, num_codes, channels)) - - scaled_l2_norm = reshaped_scale * ( - expanded_x - reshaped_codewords).pow(2).sum(dim=3) - return scaled_l2_norm - - @staticmethod - def aggregate(assignment_weights, x, codewords): - num_codes, channels = codewords.size() - reshaped_codewords = codewords.view((1, 1, num_codes, channels)) - batch_size = x.size(0) - - expanded_x = x.unsqueeze(2).expand( - (batch_size, x.size(1), num_codes, channels)) - encoded_feat = (assignment_weights.unsqueeze(3) * - (expanded_x - reshaped_codewords)).sum(dim=1) - return encoded_feat - - def forward(self, x): - assert x.dim() == 4 and x.size(1) == self.channels - # [batch_size, channels, height, width] - batch_size = x.size(0) - # [batch_size, height x width, channels] - x = x.view(batch_size, self.channels, -1).transpose(1, 2).contiguous() - # assignment_weights: [batch_size, channels, num_codes] - assignment_weights = F.softmax( - self.scaled_l2(x, self.codewords, self.scale), dim=2) - # aggregate - encoded_feat = self.aggregate(assignment_weights, x, self.codewords) - return encoded_feat - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \ - f'x{self.channels})' - return repr_str diff --git 
a/ControlNet/annotator/uniformer/mmseg/ops/wrappers.py b/ControlNet/annotator/uniformer/mmseg/ops/wrappers.py deleted file mode 100644 index 0ed9a0cb8d7c0e0ec2748dd89c652756653cac78..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/ops/wrappers.py +++ /dev/null @@ -1,50 +0,0 @@ -import warnings - -import torch.nn as nn -import torch.nn.functional as F - - -def resize(input, - size=None, - scale_factor=None, - mode='nearest', - align_corners=None, - warning=True): - if warning: - if size is not None and align_corners: - input_h, input_w = tuple(int(x) for x in input.shape[2:]) - output_h, output_w = tuple(int(x) for x in size) - if output_h > input_h or output_w > output_h: - if ((output_h > 1 and output_w > 1 and input_h > 1 - and input_w > 1) and (output_h - 1) % (input_h - 1) - and (output_w - 1) % (input_w - 1)): - warnings.warn( - f'When align_corners={align_corners}, ' - 'the output would more aligned if ' - f'input size {(input_h, input_w)} is `x+1` and ' - f'out size {(output_h, output_w)} is `nx+1`') - return F.interpolate(input, size, scale_factor, mode, align_corners) - - -class Upsample(nn.Module): - - def __init__(self, - size=None, - scale_factor=None, - mode='nearest', - align_corners=None): - super(Upsample, self).__init__() - self.size = size - if isinstance(scale_factor, tuple): - self.scale_factor = tuple(float(factor) for factor in scale_factor) - else: - self.scale_factor = float(scale_factor) if scale_factor else None - self.mode = mode - self.align_corners = align_corners - - def forward(self, x): - if not self.size: - size = [int(t * self.scale_factor) for t in x.shape[-2:]] - else: - size = self.size - return resize(x, size, None, self.mode, self.align_corners) diff --git a/ControlNet/annotator/uniformer/mmseg/utils/__init__.py b/ControlNet/annotator/uniformer/mmseg/utils/__init__.py deleted file mode 100644 index ac489e2dbbc0e6fa87f5088b4edcc20f8cadc1a6..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .collect_env import collect_env -from .logger import get_root_logger - -__all__ = ['get_root_logger', 'collect_env'] diff --git a/ControlNet/annotator/uniformer/mmseg/utils/collect_env.py b/ControlNet/annotator/uniformer/mmseg/utils/collect_env.py deleted file mode 100644 index 65c2134ddbee9655161237dd0894d38c768c2624..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/utils/collect_env.py +++ /dev/null @@ -1,17 +0,0 @@ -from annotator.uniformer.mmcv.utils import collect_env as collect_base_env -from annotator.uniformer.mmcv.utils import get_git_hash - -import annotator.uniformer.mmseg as mmseg - - -def collect_env(): - """Collect the information of the running environments.""" - env_info = collect_base_env() - env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' - - return env_info - - -if __name__ == '__main__': - for name, val in collect_env().items(): - print('{}: {}'.format(name, val)) diff --git a/ControlNet/annotator/uniformer/mmseg/utils/logger.py b/ControlNet/annotator/uniformer/mmseg/utils/logger.py deleted file mode 100644 index 4149d9eda3dfef07490352d22ac40c42460315e4..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/uniformer/mmseg/utils/logger.py +++ /dev/null @@ -1,27 +0,0 @@ -import logging - -from annotator.uniformer.mmcv.utils import get_logger - - -def get_root_logger(log_file=None, log_level=logging.INFO): - """Get the root logger. 
- - The logger will be initialized if it has not been initialized. By default a - StreamHandler will be added. If `log_file` is specified, a FileHandler will - also be added. The name of the root logger is the top-level package name, - e.g., "mmseg". - - Args: - log_file (str | None): The log filename. If specified, a FileHandler - will be added to the root logger. - log_level (int): The root logger level. Note that only the process of - rank 0 is affected, while other processes will set the level to - "Error" and be silent most of the time. - - Returns: - logging.Logger: The root logger. - """ - - logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) - - return logger diff --git a/ControlNet/annotator/util.py b/ControlNet/annotator/util.py deleted file mode 100644 index 90831643d19cc1b9b0940df3d4fd4d846ba74a05..0000000000000000000000000000000000000000 --- a/ControlNet/annotator/util.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import cv2 -import os - - -annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts') - - -def HWC3(x): - assert x.dtype == np.uint8 - if x.ndim == 2: - x = x[:, :, None] - assert x.ndim == 3 - H, W, C = x.shape - assert C == 1 or C == 3 or C == 4 - if C == 3: - return x - if C == 1: - return np.concatenate([x, x, x], axis=2) - if C == 4: - color = x[:, :, 0:3].astype(np.float32) - alpha = x[:, :, 3:4].astype(np.float32) / 255.0 - y = color * alpha + 255.0 * (1.0 - alpha) - y = y.clip(0, 255).astype(np.uint8) - return y - - -def resize_image(input_image, resolution): - H, W, C = input_image.shape - H = float(H) - W = float(W) - k = float(resolution) / min(H, W) - H *= k - W *= k - H = int(np.round(H / 64.0)) * 64 - W = int(np.round(W / 64.0)) * 64 - img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) - return img diff --git a/ControlNet/cldm/cldm.py b/ControlNet/cldm/cldm.py deleted file mode 100644 index c1af60ffa7b4426744351dabaf41b996169a7039..0000000000000000000000000000000000000000 --- a/ControlNet/cldm/cldm.py +++ /dev/null @@ -1,435 +0,0 @@ -import einops -import torch -import torch as th -import torch.nn as nn - -from ControlNet.ldm.modules.diffusionmodules.util import ( - conv_nd, - linear, - zero_module, - timestep_embedding, -) - -from einops import rearrange, repeat -from torchvision.utils import make_grid -from ControlNet.ldm.modules.attention import SpatialTransformer -from ControlNet.ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock -from ControlNet.ldm.models.diffusion.ddpm import LatentDiffusion -from ControlNet.ldm.util import log_txt_as_img, exists, instantiate_from_config -from ControlNet.ldm.models.diffusion.ddim import DDIMSampler - - -class ControlledUnetModel(UNetModel): - def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs): - hs = [] - with torch.no_grad(): - t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) - emb = self.time_embed(t_emb) - h = x.type(self.dtype) - for module in self.input_blocks: - h = module(h, emb, context) - hs.append(h) - h = self.middle_block(h, emb, context) - - if control is not None: - h += control.pop() - - for i, module in enumerate(self.output_blocks): - if only_mid_control or control is None: - h = torch.cat([h, hs.pop()], dim=1) - else: - h = torch.cat([h, hs.pop() + control.pop()], dim=1) - h = module(h, emb, context) - - h = h.type(x.dtype) - return self.out(h) - - -class 
ControlNet(nn.Module): - def __init__( - self, - image_size, - in_channels, - model_channels, - hint_channels, - num_res_blocks, - attention_resolutions, - dropout=0, - channel_mult=(1, 2, 4, 8), - conv_resample=True, - dims=2, - use_checkpoint=False, - use_fp16=False, - num_heads=-1, - num_head_channels=-1, - num_heads_upsample=-1, - use_scale_shift_norm=False, - resblock_updown=False, - use_new_attention_order=False, - use_spatial_transformer=False, # custom transformer support - transformer_depth=1, # custom transformer support - context_dim=None, # custom transformer support - n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model - legacy=True, - disable_self_attentions=None, - num_attention_blocks=None, - disable_middle_self_attn=False, - use_linear_in_transformer=False, - ): - super().__init__() - if use_spatial_transformer: - assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...' - - if context_dim is not None: - assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...' - from omegaconf.listconfig import ListConfig - if type(context_dim) == ListConfig: - context_dim = list(context_dim) - - if num_heads_upsample == -1: - num_heads_upsample = num_heads - - if num_heads == -1: - assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' - - if num_head_channels == -1: - assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' - - self.dims = dims - self.image_size = image_size - self.in_channels = in_channels - self.model_channels = model_channels - if isinstance(num_res_blocks, int): - self.num_res_blocks = len(channel_mult) * [num_res_blocks] - else: - if len(num_res_blocks) != len(channel_mult): - raise ValueError("provide num_res_blocks either as an int (globally constant) or " - "as a list/tuple (per-level) with the same length as channel_mult") - self.num_res_blocks = num_res_blocks - if disable_self_attentions is not None: - # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not - assert len(disable_self_attentions) == len(channel_mult) - if num_attention_blocks is not None: - assert len(num_attention_blocks) == len(self.num_res_blocks) - assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)))) - print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
" - f"This option has LESS priority than attention_resolutions {attention_resolutions}, " - f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, " - f"attention will still not be set.") - - self.attention_resolutions = attention_resolutions - self.dropout = dropout - self.channel_mult = channel_mult - self.conv_resample = conv_resample - self.use_checkpoint = use_checkpoint - self.dtype = th.float16 if use_fp16 else th.float32 - self.num_heads = num_heads - self.num_head_channels = num_head_channels - self.num_heads_upsample = num_heads_upsample - self.predict_codebook_ids = n_embed is not None - - time_embed_dim = model_channels * 4 - self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), - nn.SiLU(), - linear(time_embed_dim, time_embed_dim), - ) - - self.input_blocks = nn.ModuleList( - [ - TimestepEmbedSequential( - conv_nd(dims, in_channels, model_channels, 3, padding=1) - ) - ] - ) - self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)]) - - self.input_hint_block = TimestepEmbedSequential( - conv_nd(dims, hint_channels, 16, 3, padding=1), - nn.SiLU(), - conv_nd(dims, 16, 16, 3, padding=1), - nn.SiLU(), - conv_nd(dims, 16, 32, 3, padding=1, stride=2), - nn.SiLU(), - conv_nd(dims, 32, 32, 3, padding=1), - nn.SiLU(), - conv_nd(dims, 32, 96, 3, padding=1, stride=2), - nn.SiLU(), - conv_nd(dims, 96, 96, 3, padding=1), - nn.SiLU(), - conv_nd(dims, 96, 256, 3, padding=1, stride=2), - nn.SiLU(), - zero_module(conv_nd(dims, 256, model_channels, 3, padding=1)) - ) - - self._feature_size = model_channels - input_block_chans = [model_channels] - ch = model_channels - ds = 1 - for level, mult in enumerate(channel_mult): - for nr in range(self.num_res_blocks[level]): - layers = [ - ResBlock( - ch, - time_embed_dim, - dropout, - out_channels=mult * model_channels, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ) - ] - ch = mult * model_channels - if ds in attention_resolutions: - if num_head_channels == -1: - dim_head = ch // num_heads - else: - num_heads = ch // num_head_channels - dim_head = num_head_channels - if legacy: - # num_heads = 1 - dim_head = ch // num_heads if use_spatial_transformer else num_head_channels - if exists(disable_self_attentions): - disabled_sa = disable_self_attentions[level] - else: - disabled_sa = False - - if not exists(num_attention_blocks) or nr < num_attention_blocks[level]: - layers.append( - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) if not use_spatial_transformer else SpatialTransformer( - ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, - disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint - ) - ) - self.input_blocks.append(TimestepEmbedSequential(*layers)) - self.zero_convs.append(self.make_zero_conv(ch)) - self._feature_size += ch - input_block_chans.append(ch) - if level != len(channel_mult) - 1: - out_ch = ch - self.input_blocks.append( - TimestepEmbedSequential( - ResBlock( - ch, - time_embed_dim, - dropout, - out_channels=out_ch, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - down=True, - ) - if resblock_updown - else Downsample( - ch, conv_resample, dims=dims, out_channels=out_ch - ) - ) - ) - ch = out_ch - input_block_chans.append(ch) - self.zero_convs.append(self.make_zero_conv(ch)) - ds *= 2 - 
self._feature_size += ch - - if num_head_channels == -1: - dim_head = ch // num_heads - else: - num_heads = ch // num_head_channels - dim_head = num_head_channels - if legacy: - # num_heads = 1 - dim_head = ch // num_heads if use_spatial_transformer else num_head_channels - self.middle_block = TimestepEmbedSequential( - ResBlock( - ch, - time_embed_dim, - dropout, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ), - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn - ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint - ), - ResBlock( - ch, - time_embed_dim, - dropout, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ), - ) - self.middle_block_out = self.make_zero_conv(ch) - self._feature_size += ch - - def make_zero_conv(self, channels): - return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0))) - - def forward(self, x, hint, timesteps, context, **kwargs): - t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) - emb = self.time_embed(t_emb) - - guided_hint = self.input_hint_block(hint, emb, context) - - outs = [] - - h = x.type(self.dtype) - for module, zero_conv in zip(self.input_blocks, self.zero_convs): - if guided_hint is not None: - h = module(h, emb, context) - h += guided_hint - guided_hint = None - else: - h = module(h, emb, context) - outs.append(zero_conv(h, emb, context)) - - h = self.middle_block(h, emb, context) - outs.append(self.middle_block_out(h, emb, context)) - - return outs - - -class ControlLDM(LatentDiffusion): - - def __init__(self, control_stage_config, control_key, only_mid_control, *args, **kwargs): - super().__init__(*args, **kwargs) - self.control_model = instantiate_from_config(control_stage_config) - self.control_key = control_key - self.only_mid_control = only_mid_control - self.control_scales = [1.0] * 13 - - @torch.no_grad() - def get_input(self, batch, k, bs=None, *args, **kwargs): - x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs) - control = batch[self.control_key] - if bs is not None: - control = control[:bs] - control = control.to(self.device) - control = einops.rearrange(control, 'b h w c -> b c h w') - control = control.to(memory_format=torch.contiguous_format).float() - return x, dict(c_crossattn=[c], c_concat=[control]) - - def apply_model(self, x_noisy, t, cond, *args, **kwargs): - assert isinstance(cond, dict) - diffusion_model = self.model.diffusion_model - - cond_txt = torch.cat(cond['c_crossattn'], 1) - - if cond['c_concat'] is None: - eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control) - else: - control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt) - control = [c * scale for c, scale in zip(control, self.control_scales)] - eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control) - - return eps - - @torch.no_grad() - def get_unconditional_conditioning(self, N): - return self.get_learned_conditioning([""] * N) - - @torch.no_grad() - def log_images(self, batch, 
N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None, - quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True, - plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None, - use_ema_scope=True, - **kwargs): - use_ddim = ddim_steps is not None - - log = dict() - z, c = self.get_input(batch, self.first_stage_key, bs=N) - c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N] - N = min(z.shape[0], N) - n_row = min(z.shape[0], n_row) - log["reconstruction"] = self.decode_first_stage(z) - log["control"] = c_cat * 2.0 - 1.0 - log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16) - - if plot_diffusion_rows: - # get diffusion row - diffusion_row = list() - z_start = z[:n_row] - for t in range(self.num_timesteps): - if t % self.log_every_t == 0 or t == self.num_timesteps - 1: - t = repeat(torch.tensor([t]), '1 -> b', b=n_row) - t = t.to(self.device).long() - noise = torch.randn_like(z_start) - z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) - diffusion_row.append(self.decode_first_stage(z_noisy)) - - diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W - diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') - diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w') - diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) - log["diffusion_row"] = diffusion_grid - - if sample: - # get denoise row - samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]}, - batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta) - x_samples = self.decode_first_stage(samples) - log["samples"] = x_samples - if plot_denoise_rows: - denoise_grid = self._get_denoise_row_from_list(z_denoise_row) - log["denoise_row"] = denoise_grid - - if unconditional_guidance_scale > 1.0: - uc_cross = self.get_unconditional_conditioning(N) - uc_cat = c_cat # torch.zeros_like(c_cat) - uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]} - samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]}, - batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=uc_full, - ) - x_samples_cfg = self.decode_first_stage(samples_cfg) - log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg - - return log - - @torch.no_grad() - def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): - ddim_sampler = DDIMSampler(self) - b, c, h, w = cond["c_concat"][0].shape - shape = (self.channels, h // 8, w // 8) - samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs) - return samples, intermediates - - def configure_optimizers(self): - lr = self.learning_rate - params = list(self.control_model.parameters()) - if not self.sd_locked: - params += list(self.model.diffusion_model.output_blocks.parameters()) - params += list(self.model.diffusion_model.out.parameters()) - opt = torch.optim.AdamW(params, lr=lr) - return opt - - def low_vram_shift(self, is_diffusing): - if is_diffusing: - self.model = self.model.cuda() - self.control_model = self.control_model.cuda() - self.first_stage_model = self.first_stage_model.cpu() - self.cond_stage_model = self.cond_stage_model.cpu() - else: - self.model = self.model.cpu() - self.control_model = self.control_model.cpu() - self.first_stage_model = self.first_stage_model.cuda() - 
self.cond_stage_model = self.cond_stage_model.cuda() diff --git a/ControlNet/cldm/ddim_hacked.py b/ControlNet/cldm/ddim_hacked.py deleted file mode 100644 index 967fd53bd20418c2e629e77b28e6490c60fb968d..0000000000000000000000000000000000000000 --- a/ControlNet/cldm/ddim_hacked.py +++ /dev/null @@ -1,317 +0,0 @@ -"""SAMPLING ONLY.""" - -import torch -import numpy as np -from tqdm import tqdm - -from ControlNet.ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor - - -class DDIMSampler(object): - def __init__(self, model, schedule="linear", **kwargs): - super().__init__() - self.model = model - self.ddpm_num_timesteps = model.num_timesteps - self.schedule = schedule - - def register_buffer(self, name, attr): - if type(attr) == torch.Tensor: - if attr.device != torch.device("cuda"): - attr = attr.to(torch.device("cuda")) - setattr(self, name, attr) - - def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): - self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, - num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose) - alphas_cumprod = self.model.alphas_cumprod - assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' - to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) - - self.register_buffer('betas', to_torch(self.model.betas)) - self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) - self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) - self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) - self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) - - # ddim sampling parameters - ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), - ddim_timesteps=self.ddim_timesteps, - eta=ddim_eta,verbose=verbose) - self.register_buffer('ddim_sigmas', ddim_sigmas) - self.register_buffer('ddim_alphas', ddim_alphas) - self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) - self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas)) - sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( - (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( - 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) - self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) - - @torch.no_grad() - def sample(self, - S, - batch_size, - shape, - conditioning=None, - callback=None, - normals_sequence=None, - img_callback=None, - quantize_x0=False, - eta=0., - mask=None, - x0=None, - temperature=1., - noise_dropout=0., - score_corrector=None, - corrector_kwargs=None, - verbose=True, - x_T=None, - log_every_t=100, - unconditional_guidance_scale=1., - unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
- dynamic_threshold=None, - ucg_schedule=None, - **kwargs - ): - if conditioning is not None: - if isinstance(conditioning, dict): - ctmp = conditioning[list(conditioning.keys())[0]] - while isinstance(ctmp, list): ctmp = ctmp[0] - cbs = ctmp.shape[0] - if cbs != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - - elif isinstance(conditioning, list): - for ctmp in conditioning: - if ctmp.shape[0] != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - - else: - if conditioning.shape[0] != batch_size: - print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") - - self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) - # sampling - C, H, W = shape - size = (batch_size, C, H, W) - print(f'Data shape for DDIM sampling is {size}, eta {eta}') - - samples, intermediates = self.ddim_sampling(conditioning, size, - callback=callback, - img_callback=img_callback, - quantize_denoised=quantize_x0, - mask=mask, x0=x0, - ddim_use_original_steps=False, - noise_dropout=noise_dropout, - temperature=temperature, - score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - x_T=x_T, - log_every_t=log_every_t, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - dynamic_threshold=dynamic_threshold, - ucg_schedule=ucg_schedule - ) - return samples, intermediates - - @torch.no_grad() - def ddim_sampling(self, cond, shape, - x_T=None, ddim_use_original_steps=False, - callback=None, timesteps=None, quantize_denoised=False, - mask=None, x0=None, img_callback=None, log_every_t=100, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None, - ucg_schedule=None): - device = self.model.betas.device - b = shape[0] - if x_T is None: - img = torch.randn(shape, device=device) - else: - img = x_T - - if timesteps is None: - timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps - elif timesteps is not None and not ddim_use_original_steps: - subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 - timesteps = self.ddim_timesteps[:subset_end] - - intermediates = {'x_inter': [img], 'pred_x0': [img]} - time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps) - total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] - print(f"Running DDIM Sampling with {total_steps} timesteps") - - iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) - - for i, step in enumerate(iterator): - index = total_steps - i - 1 - ts = torch.full((b,), step, device=device, dtype=torch.long) - - if mask is not None: - assert x0 is not None - img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? - img = img_orig * mask + (1. 
- mask) * img - - if ucg_schedule is not None: - assert len(ucg_schedule) == len(time_range) - unconditional_guidance_scale = ucg_schedule[i] - - outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, - quantize_denoised=quantize_denoised, temperature=temperature, - noise_dropout=noise_dropout, score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - dynamic_threshold=dynamic_threshold) - img, pred_x0 = outs - if callback: callback(i) - if img_callback: img_callback(pred_x0, i) - - if index % log_every_t == 0 or index == total_steps - 1: - intermediates['x_inter'].append(img) - intermediates['pred_x0'].append(pred_x0) - - return img, intermediates - - @torch.no_grad() - def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, - dynamic_threshold=None): - b, *_, device = *x.shape, x.device - - if unconditional_conditioning is None or unconditional_guidance_scale == 1.: - model_output = self.model.apply_model(x, t, c) - else: - model_t = self.model.apply_model(x, t, c) - model_uncond = self.model.apply_model(x, t, unconditional_conditioning) - model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond) - - if self.model.parameterization == "v": - e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) - else: - e_t = model_output - - if score_corrector is not None: - assert self.model.parameterization == "eps", 'not implemented' - e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) - - alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas - alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev - sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas - sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas - # select parameters corresponding to the currently considered timestep - a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) - a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) - sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) - sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device) - - # current prediction for x_0 - if self.model.parameterization != "v": - pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() - else: - pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) - - if quantize_denoised: - pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) - - if dynamic_threshold is not None: - raise NotImplementedError() - - # direction pointing to x_t - dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t - noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature - if noise_dropout > 0.: - noise = torch.nn.functional.dropout(noise, p=noise_dropout) - x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise - return x_prev, pred_x0 - - @torch.no_grad() - def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None, - unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None): - timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps - num_reference_steps = timesteps.shape[0] - - assert t_enc <= num_reference_steps - num_steps = t_enc - - if use_original_steps: - alphas_next = self.alphas_cumprod[:num_steps] - alphas = self.alphas_cumprod_prev[:num_steps] - else: - alphas_next = self.ddim_alphas[:num_steps] - alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) - - x_next = x0 - intermediates = [] - inter_steps = [] - for i in tqdm(range(num_steps), desc='Encoding Image'): - t = torch.full((x0.shape[0],), timesteps[i], device=self.model.device, dtype=torch.long) - if unconditional_guidance_scale == 1.: - noise_pred = self.model.apply_model(x_next, t, c) - else: - assert unconditional_conditioning is not None - e_t_uncond, noise_pred = torch.chunk( - self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)), - torch.cat((unconditional_conditioning, c))), 2) - noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond) - - xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next - weighted_noise_pred = alphas_next[i].sqrt() * ( - (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred - x_next = xt_weighted + weighted_noise_pred - if return_intermediates and i % ( - num_steps // return_intermediates) == 0 and i < num_steps - 1: - intermediates.append(x_next) - inter_steps.append(i) - elif return_intermediates and i >= num_steps - 2: - intermediates.append(x_next) - inter_steps.append(i) - if callback: callback(i) - - out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} - if return_intermediates: - out.update({'intermediates': intermediates}) - return x_next, out - - @torch.no_grad() - def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): - # fast, but does not allow for exact reconstruction - # t serves as an index to gather the correct alphas - if use_original_steps: - sqrt_alphas_cumprod = self.sqrt_alphas_cumprod - sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod - else: - sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) - sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas - - if noise is None: - noise = torch.randn_like(x0) - return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + - extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise) - - @torch.no_grad() - def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None, - use_original_steps=False, callback=None): - - timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps - timesteps = timesteps[:t_start] - - time_range = np.flip(timesteps) - total_steps = timesteps.shape[0] - print(f"Running DDIM Sampling with {total_steps} timesteps") - - iterator = tqdm(time_range, desc='Decoding image', total=total_steps) - x_dec = x_latent - for i, step in enumerate(iterator): - index = total_steps - i - 1 - ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, 
dtype=torch.long) - x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning) - if callback: callback(i) - return x_dec diff --git a/ControlNet/cldm/hack.py b/ControlNet/cldm/hack.py deleted file mode 100644 index 18a1d3923c9c99bfacc45b9d876a9b3b662e6351..0000000000000000000000000000000000000000 --- a/ControlNet/cldm/hack.py +++ /dev/null @@ -1,111 +0,0 @@ -import torch -import einops - -import ControlNet.ldm.modules.encoders.modules -import ControlNet.ldm.modules.attention - -from transformers import logging -from ControlNet.ldm.modules.attention import default - - -def disable_verbosity(): - logging.set_verbosity_error() - print('logging improved.') - return - - -def enable_sliced_attention(): - ControlNet.ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward - print('Enabled sliced_attention.') - return - - -def hack_everything(clip_skip=0): - disable_verbosity() - ControlNet.ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward - ControlNet.ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip - print('Enabled clip hacks.') - return - - -# Written by Lvmin -def _hacked_clip_forward(self, text): - PAD = self.tokenizer.pad_token_id - EOS = self.tokenizer.eos_token_id - BOS = self.tokenizer.bos_token_id - - def tokenize(t): - return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"] - - def transformer_encode(t): - if self.clip_skip > 1: - rt = self.transformer(input_ids=t, output_hidden_states=True) - return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip]) - else: - return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state - - def split(x): - return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3] - - def pad(x, p, i): - return x[:i] if len(x) >= i else x + [p] * (i - len(x)) - - raw_tokens_list = tokenize(text) - tokens_list = [] - - for raw_tokens in raw_tokens_list: - raw_tokens_123 = split(raw_tokens) - raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123] - raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123] - tokens_list.append(raw_tokens_123) - - tokens_list = torch.IntTensor(tokens_list).to(self.device) - - feed = einops.rearrange(tokens_list, 'b f i -> (b f) i') - y = transformer_encode(feed) - z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3) - - return z - - -# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py -def _hacked_sliced_attentin_forward(self, x, context=None, mask=None): - h = self.heads - - q = self.to_q(x) - context = default(context, x) - k = self.to_k(context) - v = self.to_v(context) - del context, x - - q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) - - limit = k.shape[0] - att_step = 1 - q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0)) - k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0)) - v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0)) - - q_chunks.reverse() - k_chunks.reverse() - v_chunks.reverse() - sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device) - del k, q, v - for i in range(0, limit, att_step): - q_buffer = q_chunks.pop() - k_buffer = k_chunks.pop() - v_buffer = v_chunks.pop() - sim_buffer = torch.einsum('b i d, b j d -> b i j', 
q_buffer, k_buffer) * self.scale - - del k_buffer, q_buffer - # attention, what we cannot get enough of, by chunks - - sim_buffer = sim_buffer.softmax(dim=-1) - - sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer) - del v_buffer - sim[i:i + att_step, :, :] = sim_buffer - - del sim_buffer - sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h) - return self.to_out(sim) diff --git a/ControlNet/cldm/logger.py b/ControlNet/cldm/logger.py deleted file mode 100644 index 6a8803846f2a8979f87f3cf9ea5b12869439e62f..0000000000000000000000000000000000000000 --- a/ControlNet/cldm/logger.py +++ /dev/null @@ -1,76 +0,0 @@ -import os - -import numpy as np -import torch -import torchvision -from PIL import Image -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.utilities.distributed import rank_zero_only - - -class ImageLogger(Callback): - def __init__(self, batch_frequency=2000, max_images=4, clamp=True, increase_log_steps=True, - rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False, - log_images_kwargs=None): - super().__init__() - self.rescale = rescale - self.batch_freq = batch_frequency - self.max_images = max_images - if not increase_log_steps: - self.log_steps = [self.batch_freq] - self.clamp = clamp - self.disabled = disabled - self.log_on_batch_idx = log_on_batch_idx - self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} - self.log_first_step = log_first_step - - @rank_zero_only - def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx): - root = os.path.join(save_dir, "image_log", split) - for k in images: - grid = torchvision.utils.make_grid(images[k], nrow=4) - if self.rescale: - grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w - grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) - grid = grid.numpy() - grid = (grid * 255).astype(np.uint8) - filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(k, global_step, current_epoch, batch_idx) - path = os.path.join(root, filename) - os.makedirs(os.path.split(path)[0], exist_ok=True) - Image.fromarray(grid).save(path) - - def log_img(self, pl_module, batch, batch_idx, split="train"): - check_idx = batch_idx # if self.log_on_batch_idx else pl_module.global_step - if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0 - hasattr(pl_module, "log_images") and - callable(pl_module.log_images) and - self.max_images > 0): - logger = type(pl_module.logger) - - is_train = pl_module.training - if is_train: - pl_module.eval() - - with torch.no_grad(): - images = pl_module.log_images(batch, split=split, **self.log_images_kwargs) - - for k in images: - N = min(images[k].shape[0], self.max_images) - images[k] = images[k][:N] - if isinstance(images[k], torch.Tensor): - images[k] = images[k].detach().cpu() - if self.clamp: - images[k] = torch.clamp(images[k], -1., 1.) 
- - self.log_local(pl_module.logger.save_dir, split, images, - pl_module.global_step, pl_module.current_epoch, batch_idx) - - if is_train: - pl_module.train() - - def check_frequency(self, check_idx): - return check_idx % self.batch_freq == 0 - - def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): - if not self.disabled: - self.log_img(pl_module, batch, batch_idx, split="train") diff --git a/ControlNet/cldm/model.py b/ControlNet/cldm/model.py deleted file mode 100644 index f80dbd8c91f5dd1f9d7ac0bbbec4d107a6eb549f..0000000000000000000000000000000000000000 --- a/ControlNet/cldm/model.py +++ /dev/null @@ -1,28 +0,0 @@ -import os -import torch - -from omegaconf import OmegaConf -from ControlNet.ldm.util import instantiate_from_config - - -def get_state_dict(d): - return d.get('state_dict', d) - - -def load_state_dict(ckpt_path, location='cpu'): - _, extension = os.path.splitext(ckpt_path) - if extension.lower() == ".safetensors": - import safetensors.torch - state_dict = safetensors.torch.load_file(ckpt_path, device=location) - else: - state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location))) - state_dict = get_state_dict(state_dict) - print(f'Loaded state_dict from [{ckpt_path}]') - return state_dict - - -def create_model(config_path): - config = OmegaConf.load(config_path) - model = instantiate_from_config(config.model).cpu() - print(f'Loaded model config from [{config_path}]') - return model diff --git a/ControlNet/config.py b/ControlNet/config.py deleted file mode 100644 index e0c738d8cbad66bbe1666284aef926c326849701..0000000000000000000000000000000000000000 --- a/ControlNet/config.py +++ /dev/null @@ -1 +0,0 @@ -save_memory = False diff --git a/ControlNet/docs/annotator.md b/ControlNet/docs/annotator.md deleted file mode 100644 index 5111678ff11407da19a920e0c500ee0480bbb155..0000000000000000000000000000000000000000 --- a/ControlNet/docs/annotator.md +++ /dev/null @@ -1,49 +0,0 @@ -# Automatic Annotations - -We provide gradio examples to obtain annotations that are aligned to our pretrained production-ready models. - -Just run - - python gradio_annotator.py - -Since everyone has different habit to organize their datasets, we do not hard code any scripts for batch processing. But "gradio_annotator.py" is written in a super readable way, and modifying it to annotate your images should be easy. - -In the gradio UI of "gradio_annotator.py" we have the following interfaces: - -### Canny Edge - -Be careful about "black edge and white background" or "white edge and black background". - -![p](../github_page/a1.png) - -### HED Edge - -Be careful about "black edge and white background" or "white edge and black background". - -![p](../github_page/a2.png) - -### MLSD Edge - -Be careful about "black edge and white background" or "white edge and black background". - -![p](../github_page/a3.png) - -### MIDAS Depth and Normal - -Be careful about RGB or BGR in normal maps. - -![p](../github_page/a4.png) - -### Openpose - -Be careful about RGB or BGR in pose maps. - -For our production-ready model, the hand pose option is turned off. - -![p](../github_page/a5.png) - -### Uniformer Segmentation - -Be careful about RGB or BGR in segmentation maps. 
- -![p](../github_page/a6.png) diff --git a/ControlNet/docs/faq.md b/ControlNet/docs/faq.md deleted file mode 100644 index 07afd7aeacb51cac4c8bac3b601fe23a2842c4d3..0000000000000000000000000000000000000000 --- a/ControlNet/docs/faq.md +++ /dev/null @@ -1,21 +0,0 @@ -# FAQs - -**Q:** If the weight of a conv layer is zero, the gradient will also be zero, and the network will not learn anything. Why does "zero convolution" work? - -**A:** This is wrong. Let us consider a very simple case - -$$y=wx+b$$ - -and we have - -$$\partial y/\partial w=x, \partial y/\partial x=w, \partial y/\partial b=1$$ - -and if $w=0$ and $x \neq 0$, then - -$$\partial y/\partial w \neq 0, \partial y/\partial x=0, \partial y/\partial b\neq 0$$ - -which means that as long as $x \neq 0$, one gradient descent iteration will make $w$ non-zero. Then - -$$\partial y/\partial x\neq 0$$ - -so the zero convolutions will progressively become common conv layers with non-zero weights. diff --git a/ControlNet/docs/low_vram.md b/ControlNet/docs/low_vram.md deleted file mode 100644 index 784964c78d5074ed9d318456d7c35f30a81f04ed..0000000000000000000000000000000000000000 --- a/ControlNet/docs/low_vram.md +++ /dev/null @@ -1,15 +0,0 @@ -# Enable Low VRAM Mode - -If you are using an 8GB GPU card (or if you want a larger batch size), please open "config.py", and then set - -```python -save_memory = True -``` - -This feature is still being tested - not all graphics cards are guaranteed to succeed. - -But it should be neat, as I can now diffuse at a batch size of 12. - -(prompt "man") - -![p](../github_page/ram12.jpg) diff --git a/ControlNet/docs/train.md b/ControlNet/docs/train.md deleted file mode 100644 index fa773925e28c4100df4bf74f0536d432554db806..0000000000000000000000000000000000000000 --- a/ControlNet/docs/train.md +++ /dev/null @@ -1,276 +0,0 @@ -# Train a ControlNet to Control SD - -You are here because you want to control SD in your own way: maybe you have an idea for your perfect research project, and you will annotate some data or have already annotated your own dataset automatically or manually. Here, the control can be anything that can be converted to images, such as edges, keypoints, segments, etc. - -Before moving on to your own dataset, we highly recommend first trying the toy dataset, Fill50K, as a sanity check. This will help you get a "feeling" for the training: you will know how long it takes for the model to converge, whether your device can complete the training in an acceptable amount of time, and what it "feels" like when the model converges. - -We hope that after you read this page, you will find that training a ControlNet is as easy as (or easier than) training a pix2pix. - -## Step 0 - Design your control - -Let us take a look at a very simple task: controlling SD to fill circles with color. - -![p](../github_page/t1.png) - -This is simple: we want to control SD to fill a circle with colors, and the prompt contains some description of our target. - -Stable diffusion is trained on billions of images, and it already knows what "cyan", "circle", "pink", and "background" are. - -But it does not know the meaning of that "Control Image (Source Image)". Our goal is to let it know. - -## Step 1 - Get a dataset - -Just download the Fill50K dataset from [our huggingface page](https://huggingface.co/lllyasviel/ControlNet) (training/fill50k.zip, the file is only 200M!).
Make sure that the data is decompressed as - - ControlNet/training/fill50k/prompt.json - ControlNet/training/fill50k/source/X.png - ControlNet/training/fill50k/target/X.png - -In the folder "fill50k/source", you will have 50k images of circle lines. - -![p](../github_page/t2.png) - -In the folder "fill50k/target", you will have 50k images of filled circles. - -![p](../github_page/t3.png) - -In the "fill50k/prompt.json", you will have their filenames and prompts. Each prompt is like "a balabala color circle in some other color background." - -![p](../github_page/t4.png) - -## Step 2 - Load the dataset - -Then you need to write a simple script to read this dataset for pytorch. (In fact we have written it for you in "tutorial_dataset.py".) - -```python -import json -import cv2 -import numpy as np - -from torch.utils.data import Dataset - - -class MyDataset(Dataset): - def __init__(self): - self.data = [] - with open('./training/fill50k/prompt.json', 'rt') as f: - for line in f: - self.data.append(json.loads(line)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = self.data[idx] - - source_filename = item['source'] - target_filename = item['target'] - prompt = item['prompt'] - - source = cv2.imread('./training/fill50k/' + source_filename) - target = cv2.imread('./training/fill50k/' + target_filename) - - # Do not forget that OpenCV read images in BGR order. - source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB) - target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB) - - # Normalize source images to [0, 1]. - source = source.astype(np.float32) / 255.0 - - # Normalize target images to [-1, 1]. - target = (target.astype(np.float32) / 127.5) - 1.0 - - return dict(jpg=target, txt=prompt, hint=source) - -``` - -This will make your dataset into an array-like object in python. You can test this dataset simply by accessing the array, like this - -```python -from tutorial_dataset import MyDataset - -dataset = MyDataset() -print(len(dataset)) - -item = dataset[1234] -jpg = item['jpg'] -txt = item['txt'] -hint = item['hint'] -print(txt) -print(jpg.shape) -print(hint.shape) - -``` - -The outputs of this simple test on my machine are - - 50000 - burly wood circle with orange background - (512, 512, 3) - (512, 512, 3) - -And this code is in "tutorial_dataset_test.py". - -In this way, the dataset is an array-like object with 50000 items. Each item is a dict with three entry "jpg", "txt", and "hint". The "jpg" is the target image, the "hint" is the control image, and the "txt" is the prompt. - -Do not ask us why we use these three names - this is related to the dark history of a library called LDM. - -## Step 3 - What SD model do you want to control? - -Then you need to decide which Stable Diffusion Model you want to control. In this example, we will just use standard SD1.5. You can download it from the [official page of Stability](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main). You want the file ["v1-5-pruned.ckpt"](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main). - -(Or ["v2-1_512-ema-pruned.ckpt"](https://huggingface.co/stabilityai/stable-diffusion-2-1-base/tree/main) if you are using SD2.) - -Then you need to attach a control net to the SD model. The architecture is - -![img](../github_page/sd.png) - -Note that all weights inside the ControlNet are also copied from SD so that no layer is trained from scratch, and you are still finetuning the entire model. - -We provide a simple script for you to achieve this easily. 
If your SD filename is "./models/v1-5-pruned.ckpt" and you want the script to save the processed model (SD+ControlNet) at location "./models/control_sd15_ini.ckpt", you can just run: - - python tool_add_control.py ./models/v1-5-pruned.ckpt ./models/control_sd15_ini.ckpt - -Or if you are using SD2: - - python tool_add_control_sd21.py ./models/v2-1_512-ema-pruned.ckpt ./models/control_sd21_ini.ckpt - -You may also use other filenames as long as the command is "python tool_add_control.py input_path output_path". - -This is the correct output from my machine: - -![img](../github_page/t5.png) - -## Step 4 - Train! - -Happy! We finally come to the most exciting part: training! - -The training code in "tutorial_train.py" is actually surprisingly simple: - -```python -import pytorch_lightning as pl -from torch.utils.data import DataLoader -from tutorial_dataset import MyDataset -from cldm.logger import ImageLogger -from cldm.model import create_model, load_state_dict - - -# Configs -resume_path = './models/control_sd15_ini.ckpt' -batch_size = 4 -logger_freq = 300 -learning_rate = 1e-5 -sd_locked = True -only_mid_control = False - - -# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs. -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict(resume_path, location='cpu')) -model.learning_rate = learning_rate -model.sd_locked = sd_locked -model.only_mid_control = only_mid_control - - -# Misc -dataset = MyDataset() -dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True) -logger = ImageLogger(batch_frequency=logger_freq) -trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger]) - - -# Train! -trainer.fit(model, dataloader) - -``` -(or "tutorial_train_sd21.py" if you are using SD2) - -Thanks to our organized dataset pytorch object and the power of pytorch_lightning, the entire code is just super short. - -Now, you may take a look at [Pytorch Lightning Official DOC](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html#trainer) to find out how to enable many useful features like gradient accumulation, multiple GPU training, accelerated dataset loading, flexible checkpoint saving, etc. All these only need about one line of code. Great! - -Note that if you find OOM, perhaps you need to enable [Low VRAM mode](low_vram.md), and perhaps you also need to use smaller batch size and gradient accumulation. Or you may also want to use some “advanced” tricks like sliced attention or xformers. For example: - -```python -# Configs -batch_size = 1 - -# Misc -trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger], accumulate_grad_batches=4) # But this will be 4x slower -``` - -Note that training with 8 GB laptop GPU is challenging. We will need some GPU memory optimization at least as good as automatic1111’s UI. This may require expert modifications to the code. - -### Screenshots - -The training is fast. After 4000 steps (batch size 4, learning rate 1e-5, about 50 minutes on PCIE 40G), the results on my machine (in an output folder "image_log") is - -Control: - -![img](../github_page/t/ip.png) - -Prompt: - -![img](../github_page/t/t.png) - -Prediction: - -![img](../github_page/t/op.png) - -Ground Truth: - -![img](../github_page/t/gt.png) - -Note that the SD's capability is preserved. Even training on this super aligned dataset, it still draws some random textures and those snow decorations. 
(Besides, note that the ground truth looks a bit modified because it is converted from SD's latent image.) - -A larger batch size and longer training will further improve this. Adequate training will make the filling perfect. - -Of course, training SD to fill circles is meaningless, but this is a successful beginning of your story. - -Let us work together to control large models more and more. - -## Other options - -Beyond the standard settings, we also provide two important parameters, "sd_locked" and "only_mid_control", that you need to know. - -### only_mid_control - -By default, only_mid_control is False. When it is True, you will train the below architecture. - -![img](../github_page/t6.png) - -This can be helpful when your computation power is limited and you want to speed up the training, or when you want to facilitate "global" context learning. Note that you may also pause training, set it to True, resume training, then pause again, set it back, and resume again. - -If your computation device is good, perhaps you do not need this. But I also know some artists are willing to train a model on their laptop for a month - in that case, perhaps this option can be useful. - -### sd_locked - -By default, sd_locked is True. When it is False, you will train the below architecture. - -![img](../github_page/t7.png) - -This will unlock some layers in SD and you will train them as a whole. - -This option is DANGEROUS! If your dataset is not good enough, this may downgrade the capability of your SD model. - -However, this option is also very useful when you are training on images with a specific style, or when you are training with special datasets (like a medical dataset with X-ray images or a geographic dataset with lots of Google Maps). You can understand this as simultaneously training the ControlNet and something like a DreamBooth. - -Also, if your dataset is large, you may want to end the training with a few thousand steps with those layers unlocked. This usually improves the "problem-specific" solutions a little. You may try it yourself to feel the difference. - -Also, if you unlock some original layers, you may want a lower learning rate, like 2e-6. - -## More Consideration: Sudden Converge Phenomenon and Gradient Accumulation - -![img](../github_page/ex1.jpg) - -Because we use zero convolutions, SD should always be able to predict meaningful images. (If it cannot, the training has already failed.) - -You will always find that at some iteration the model "suddenly" becomes able to fit some training conditions. This means that you will get a basically usable model at about 3k to 7k steps (further training will improve it, but the model after the first "sudden converge" should be basically functional). - -Note that 3k to 7k steps is not very many, and you should consider a larger batch size rather than more training steps. If you can observe the "sudden converge" at 3k steps using batch size 4, then, rather than training for 300k further steps, a better idea is to use 100× gradient accumulation to re-train those 3k steps with a 100× batch size. Note that perhaps we should not take this *too* far (100× accumulation may be too extreme), but you should consider that, since the "sudden converge" will *always* happen at that certain point, getting a better convergence is more important (a rough sketch of this trade-off follows below).
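For illustration only (this sketch is not part of the original training script, and the step counts are just the hypothetical numbers from the discussion above), the two ways of spending the same compute budget map onto pytorch_lightning's `accumulate_grad_batches` option roughly like this:

```python
import pytorch_lightning as pl

# Hypothetical budget from the discussion above: "sudden converge" appears
# around 3k optimizer steps at batch size 4, and we can afford ~90k
# forward/backward passes in total.

# Option (1): keep the small batch and simply run more optimizer steps.
trainer_more_steps = pl.Trainer(gpus=1, precision=32, max_steps=90_000)

# Option (2): 30x gradient accumulation -> logical batch size 4 * 30 = 120,
# so the same 3k optimizer steps consume the whole 90k-pass budget.
trainer_bigger_batch = pl.Trainer(gpus=1, precision=32,
                                  accumulate_grad_batches=30, max_steps=3_000)
```

Both configurations see the same number of images; the second simply packs them into fewer, larger optimizer steps, which is the trade-off discussed next.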
- -Because that "sudden converge" always happens, lets say "sudden converge" will happen at 3k step and our money can optimize 90k step, then we have two options: (1) train 3k steps, sudden converge, then train 87k steps. (2) 30x gradient accumulation, train 3k steps (90k real computation steps), then sudden converge. - -In my experiments, (2) is usually better than (1). However, in real cases, perhaps you may need to balance the steps before and after the "sudden converge" on your own to find a balance. The training after "sudden converge" is also important. - -But usually, if your logic batch size is already bigger than 256, then further extending the batch size is not very meaningful. In that case, perhaps a better idea is to train more steps. I tried some "common" logic batch size at 64 or 96 or 128 (by gradient accumulation), it seems that many complicated conditions can be solved very well already. diff --git a/ControlNet/environment.yaml b/ControlNet/environment.yaml deleted file mode 100644 index 91463f0fb17145d7649e298819811ac9a21d6b93..0000000000000000000000000000000000000000 --- a/ControlNet/environment.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: control -channels: - - pytorch - - defaults -dependencies: - - python=3.8.5 - - pip=20.3 - - cudatoolkit=11.3 - - pytorch=1.12.1 - - torchvision=0.13.1 - - numpy=1.23.1 - - pip: - - gradio==3.16.2 - - albumentations==1.3.0 - - opencv-contrib-python==4.3.0.36 - - imageio==2.9.0 - - imageio-ffmpeg==0.4.2 - - pytorch-lightning==1.5.0 - - omegaconf==2.1.1 - - test-tube>=0.7.5 - - streamlit==1.12.1 - - einops==0.3.0 - - transformers==4.19.2 - - webdataset==0.2.5 - - kornia==0.6 - - open_clip_torch==2.0.2 - - invisible-watermark>=0.1.5 - - streamlit-drawable-canvas==0.8.0 - - torchmetrics==0.6.0 - - timm==0.6.12 - - addict==2.4.0 - - yapf==0.32.0 - - prettytable==3.6.0 - - safetensors==0.2.7 - - basicsr==1.4.2 diff --git a/ControlNet/font/DejaVuSans.ttf b/ControlNet/font/DejaVuSans.ttf deleted file mode 100644 index e5f7eecce43be41ff0703ed99e1553029b849f14..0000000000000000000000000000000000000000 Binary files a/ControlNet/font/DejaVuSans.ttf and /dev/null differ diff --git a/ControlNet/gradio_annotator.py b/ControlNet/gradio_annotator.py deleted file mode 100644 index 2b1a29ebbec24073a9e4357b700e0577a17a9379..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_annotator.py +++ /dev/null @@ -1,160 +0,0 @@ -import gradio as gr - -from annotator.util import resize_image, HWC3 - - -model_canny = None - - -def canny(img, res, l, h): - img = resize_image(HWC3(img), res) - global model_canny - if model_canny is None: - from annotator.canny import CannyDetector - model_canny = CannyDetector() - result = model_canny(img, l, h) - return [result] - - -model_hed = None - - -def hed(img, res): - img = resize_image(HWC3(img), res) - global model_hed - if model_hed is None: - from annotator.hed import HEDdetector - model_hed = HEDdetector() - result = model_hed(img) - return [result] - - -model_mlsd = None - - -def mlsd(img, res, thr_v, thr_d): - img = resize_image(HWC3(img), res) - global model_mlsd - if model_mlsd is None: - from annotator.mlsd import MLSDdetector - model_mlsd = MLSDdetector() - result = model_mlsd(img, thr_v, thr_d) - return [result] - - -model_midas = None - - -def midas(img, res, a): - img = resize_image(HWC3(img), res) - global model_midas - if model_midas is None: - from annotator.midas import MidasDetector - model_midas = MidasDetector() - results = model_midas(img, a) - return results - - -model_openpose = None - - 
-def openpose(img, res, has_hand): - img = resize_image(HWC3(img), res) - global model_openpose - if model_openpose is None: - from annotator.openpose import OpenposeDetector - model_openpose = OpenposeDetector() - result, _ = model_openpose(img, has_hand) - return [result] - - -model_uniformer = None - - -def uniformer(img, res): - img = resize_image(HWC3(img), res) - global model_uniformer - if model_uniformer is None: - from annotator.uniformer import UniformerDetector - model_uniformer = UniformerDetector() - result = model_uniformer(img) - return [result] - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Canny Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - low_threshold = gr.Slider(label="low_threshold", minimum=1, maximum=255, value=100, step=1) - high_threshold = gr.Slider(label="high_threshold", minimum=1, maximum=255, value=200, step=1) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=canny, inputs=[input_image, resolution, low_threshold, high_threshold], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## HED Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=hed, inputs=[input_image, resolution], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## MLSD Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - value_threshold = gr.Slider(label="value_threshold", minimum=0.01, maximum=2.0, value=0.1, step=0.01) - distance_threshold = gr.Slider(label="distance_threshold", minimum=0.01, maximum=20.0, value=0.1, step=0.01) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=mlsd, inputs=[input_image, resolution, value_threshold, distance_threshold], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## MIDAS Depth and Normal") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - alpha = gr.Slider(label="alpha", minimum=0.1, maximum=20.0, value=6.2, step=0.01) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=midas, inputs=[input_image, resolution, alpha], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## Openpose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - hand = gr.Checkbox(label='detect hand', value=False) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=openpose, inputs=[input_image, resolution, hand], 
outputs=[gallery]) - - - with gr.Row(): - gr.Markdown("## Uniformer Segmentation") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=uniformer, inputs=[input_image, resolution], outputs=[gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_canny2image.py b/ControlNet/gradio_canny2image.py deleted file mode 100644 index 9866cac5b35925576c20ef4b9ee8b1b1cca235b2..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_canny2image.py +++ /dev/null @@ -1,97 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.canny import CannyDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_canny = CannyDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_canny.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, low_threshold, high_threshold): - with torch.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - - detected_map = apply_canny(img, low_threshold, high_threshold) - detected_map = HWC3(detected_map) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Canny Edge Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - low_threshold = gr.Slider(label="Canny low threshold", minimum=1, maximum=255, value=100, step=1) - high_threshold = gr.Slider(label="Canny high threshold", minimum=1, maximum=255, value=200, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, low_threshold, high_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_depth2image.py b/ControlNet/gradio_depth2image.py deleted file mode 100644 index ee678999ae6033c18a5026bc5f6286d0364c7851..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_depth2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.midas import MidasDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_midas = MidasDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_depth.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = 
HWC3(input_image) - detected_map, _ = apply_midas(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Depth Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Depth Resolution", minimum=128, maximum=1024, value=384, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_fake_scribble2image.py 
b/ControlNet/gradio_fake_scribble2image.py deleted file mode 100644 index a7cd375f7589c3f7c43b7df91802eb4bf87ea0e0..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_fake_scribble2image.py +++ /dev/null @@ -1,102 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.hed import HEDdetector, nms -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_hed = HEDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_hed(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - detected_map = nms(detected_map, 127, 3.0) - detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0) - detected_map[detected_map > 4] = 255 - detected_map[detected_map < 255] = 0 - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Fake Scribble Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_hed2image.py b/ControlNet/gradio_hed2image.py deleted file mode 100644 index 1ceff67969b7c64a0adcf0557f922c71dd4bfab7..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_hed2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.hed import HEDdetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_hed = HEDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_hed.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_hed(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = 
resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with HED Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_hough2image.py b/ControlNet/gradio_hough2image.py deleted file mode 100644 index 6095eeb6767e005a155ee72057b3537021b09f31..0000000000000000000000000000000000000000 --- 
a/ControlNet/gradio_hough2image.py +++ /dev/null @@ -1,100 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.mlsd import MLSDdetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_mlsd = MLSDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_mlsd.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, value_threshold, distance_threshold): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_mlsd(resize_image(input_image, detect_resolution), value_threshold, distance_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - cv2.dilate(detected_map, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Hough Line Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Hough Resolution", minimum=128, maximum=1024, value=512, step=1) - value_threshold = gr.Slider(label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, step=0.01) - distance_threshold = gr.Slider(label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, step=0.01) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, value_threshold, distance_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_normal2image.py b/ControlNet/gradio_normal2image.py deleted file mode 100644 index 30aea2f8d4a7956a609cd003f2fee23d2ab162b5..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_normal2image.py +++ /dev/null @@ -1,99 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.midas import MidasDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_midas = MidasDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_normal.pth', location='cuda')) -model = model.cuda() -ddim_sampler 
= DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, bg_threshold): - with torch.no_grad(): - input_image = HWC3(input_image) - _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map[:, :, ::-1].copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Normal Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Normal Resolution", minimum=128, maximum=1024, value=384, step=1) - bg_threshold = gr.Slider(label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.01) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', 
show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, bg_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_pose2image.py b/ControlNet/gradio_pose2image.py deleted file mode 100644 index 700973bfab2d6d5a1e294ad6ab96a85e413152b0..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_pose2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.openpose import OpenposeDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_openpose = OpenposeDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_openpose.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Human Pose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="OpenPose Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_scribble2image.py b/ControlNet/gradio_scribble2image.py deleted file mode 100644 index 8abbc25bdeaeae23b9032101336682657825ead4..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_scribble2image.py +++ /dev/null @@ -1,92 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - - detected_map = np.zeros_like(img, dtype=np.uint8) - detected_map[np.min(img, axis=2) < 127] = 255 - - control = 
torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Scribble Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_scribble2image_interactive.py b/ControlNet/gradio_scribble2image_interactive.py deleted file mode 100644 index 7308bcc1bb8387bba10c026495e0dcddae91c2db..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_scribble2image_interactive.py +++ /dev/null @@ -1,102 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning 
import seed_everything -from annotator.util import resize_image, HWC3 -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - img = resize_image(HWC3(input_image['mask'][:, :, 0]), image_resolution) - H, W, C = img.shape - - detected_map = np.zeros_like(img, dtype=np.uint8) - detected_map[np.min(img, axis=2) > 127] = 255 - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -def create_canvas(w, h): - return np.zeros(shape=(h, w, 3), dtype=np.uint8) + 255 - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Interactive Scribbles") - with gr.Row(): - with gr.Column(): - canvas_width = gr.Slider(label="Canvas Width", minimum=256, maximum=1024, value=512, step=1) - canvas_height = gr.Slider(label="Canvas Height", minimum=256, maximum=1024, value=512, step=1) - create_button = gr.Button(label="Start", value='Open drawing canvas!') - input_image = gr.Image(source='upload', type='numpy', tool='sketch') - gr.Markdown(value='Do not forget to change your brush width to make it thinner. (Gradio do not allow developers to set brush width so you need to do it manually.) 
' - 'Just click on the small pencil icon in the upper right corner of the above block.') - create_button.click(fn=create_canvas, inputs=[canvas_width, canvas_height], outputs=[input_image]) - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/gradio_seg2image.py b/ControlNet/gradio_seg2image.py deleted file mode 100644 index c3854dc7624ed6a0a68f059c5001e4973da27587..0000000000000000000000000000000000000000 --- a/ControlNet/gradio_seg2image.py +++ /dev/null @@ -1,97 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.uniformer import UniformerDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_uniformer = UniformerDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_seg.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": 
[model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Segmentation Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Segmentation Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/ControlNet/ldm/data/__init__.py b/ControlNet/ldm/data/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/data/util.py b/ControlNet/ldm/data/util.py deleted file mode 100644 index 5b60ceb2349e3bd7900ff325740e2022d2903b1c..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/data/util.py +++ /dev/null @@ -1,24 +0,0 @@ -import torch - -from ldm.modules.midas.api import load_midas_transform - - -class AddMiDaS(object): - def __init__(self, model_type): - super().__init__() - self.transform = load_midas_transform(model_type) - - def pt2np(self, x): - x = ((x + 1.0) * .5).detach().cpu().numpy() - return x - - def np2pt(self, x): - x = torch.from_numpy(x) * 2 - 1. 
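# Editor's sketch (not part of the deleted file): pt2np and np2pt in AddMiDaS are
# inverse range conversions. pt2np maps a tensor in [-1, 1] to a numpy array in
# [0, 1] for the MiDaS transform, and np2pt maps the result back. A minimal
# round-trip, assuming only torch and a toy tensor `x`:
#
#   import torch
#   x = torch.tensor([-1.0, 0.0, 1.0])
#   x_np = ((x + 1.0) * 0.5).detach().cpu().numpy()   # -> [0.0, 0.5, 1.0]
#   x_pt = torch.from_numpy(x_np) * 2 - 1.0           # -> [-1.0, 0.0, 1.0]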
- return x - - def __call__(self, sample): - # sample['jpg'] is tensor hwc in [-1, 1] at this point - x = self.pt2np(sample['jpg']) - x = self.transform({"image": x})["image"] - sample['midas_in'] = x - return sample \ No newline at end of file diff --git a/ControlNet/ldm/models/autoencoder.py b/ControlNet/ldm/models/autoencoder.py deleted file mode 100644 index d411c834e25e371cdcf165fd3589808ed43be0f9..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/autoencoder.py +++ /dev/null @@ -1,219 +0,0 @@ -import torch -import pytorch_lightning as pl -import torch.nn.functional as F -from contextlib import contextmanager - -from ControlNet.ldm.modules.diffusionmodules.model import Encoder, Decoder -from ControlNet.ldm.modules.distributions.distributions import DiagonalGaussianDistribution - -from ControlNet.ldm.util import instantiate_from_config -from ControlNet.ldm.modules.ema import LitEma - - -class AutoencoderKL(pl.LightningModule): - def __init__(self, - ddconfig, - lossconfig, - embed_dim, - ckpt_path=None, - ignore_keys=[], - image_key="image", - colorize_nlabels=None, - monitor=None, - ema_decay=None, - learn_logvar=False - ): - super().__init__() - self.learn_logvar = learn_logvar - self.image_key = image_key - self.encoder = Encoder(**ddconfig) - self.decoder = Decoder(**ddconfig) - self.loss = instantiate_from_config(lossconfig) - assert ddconfig["double_z"] - self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) - self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) - self.embed_dim = embed_dim - if colorize_nlabels is not None: - assert type(colorize_nlabels)==int - self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1)) - if monitor is not None: - self.monitor = monitor - - self.use_ema = ema_decay is not None - if self.use_ema: - self.ema_decay = ema_decay - assert 0. < ema_decay < 1. 
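# Editor's note (not part of the deleted file): when ema_decay is set, LitEma
# keeps an exponential moving average of the module parameters so that a
# smoothed copy can be swapped in for validation and sampling via ema_scope().
# Conceptually (LitEma's exact update rule, e.g. any warm-up of the decay, may
# differ), each training step does roughly:
#
#   shadow = decay * shadow + (1.0 - decay) * param
#
# with decay close to 1, so the EMA weights drift slowly and act as a running
# average of recent training states.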
- self.model_ema = LitEma(self, decay=ema_decay) - print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.") - - if ckpt_path is not None: - self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) - - def init_from_ckpt(self, path, ignore_keys=list()): - sd = torch.load(path, map_location="cpu")["state_dict"] - keys = list(sd.keys()) - for k in keys: - for ik in ignore_keys: - if k.startswith(ik): - print("Deleting key {} from state_dict.".format(k)) - del sd[k] - self.load_state_dict(sd, strict=False) - print(f"Restored from {path}") - - @contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.parameters()) - self.model_ema.copy_to(self) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - def on_train_batch_end(self, *args, **kwargs): - if self.use_ema: - self.model_ema(self) - - def encode(self, x): - h = self.encoder(x) - moments = self.quant_conv(h) - posterior = DiagonalGaussianDistribution(moments) - return posterior - - def decode(self, z): - z = self.post_quant_conv(z) - dec = self.decoder(z) - return dec - - def forward(self, input, sample_posterior=True): - posterior = self.encode(input) - if sample_posterior: - z = posterior.sample() - else: - z = posterior.mode() - dec = self.decode(z) - return dec, posterior - - def get_input(self, batch, k): - x = batch[k] - if len(x.shape) == 3: - x = x[..., None] - x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float() - return x - - def training_step(self, batch, batch_idx, optimizer_idx): - inputs = self.get_input(batch, self.image_key) - reconstructions, posterior = self(inputs) - - if optimizer_idx == 0: - # train encoder+decoder+logvar - aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, - last_layer=self.get_last_layer(), split="train") - self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True) - self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False) - return aeloss - - if optimizer_idx == 1: - # train the discriminator - discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, - last_layer=self.get_last_layer(), split="train") - - self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True) - self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False) - return discloss - - def validation_step(self, batch, batch_idx): - log_dict = self._validation_step(batch, batch_idx) - with self.ema_scope(): - log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") - return log_dict - - def _validation_step(self, batch, batch_idx, postfix=""): - inputs = self.get_input(batch, self.image_key) - reconstructions, posterior = self(inputs) - aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step, - last_layer=self.get_last_layer(), split="val"+postfix) - - discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step, - last_layer=self.get_last_layer(), split="val"+postfix) - - self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) - self.log_dict(log_dict_ae) - self.log_dict(log_dict_disc) - return self.log_dict - - def configure_optimizers(self): - lr = self.learning_rate - ae_params_list 
= list(self.encoder.parameters()) + list(self.decoder.parameters()) + list( - self.quant_conv.parameters()) + list(self.post_quant_conv.parameters()) - if self.learn_logvar: - print(f"{self.__class__.__name__}: Learning logvar") - ae_params_list.append(self.loss.logvar) - opt_ae = torch.optim.Adam(ae_params_list, - lr=lr, betas=(0.5, 0.9)) - opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(), - lr=lr, betas=(0.5, 0.9)) - return [opt_ae, opt_disc], [] - - def get_last_layer(self): - return self.decoder.conv_out.weight - - @torch.no_grad() - def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs): - log = dict() - x = self.get_input(batch, self.image_key) - x = x.to(self.device) - if not only_inputs: - xrec, posterior = self(x) - if x.shape[1] > 3: - # colorize with random projection - assert xrec.shape[1] > 3 - x = self.to_rgb(x) - xrec = self.to_rgb(xrec) - log["samples"] = self.decode(torch.randn_like(posterior.sample())) - log["reconstructions"] = xrec - if log_ema or self.use_ema: - with self.ema_scope(): - xrec_ema, posterior_ema = self(x) - if x.shape[1] > 3: - # colorize with random projection - assert xrec_ema.shape[1] > 3 - xrec_ema = self.to_rgb(xrec_ema) - log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample())) - log["reconstructions_ema"] = xrec_ema - log["inputs"] = x - return log - - def to_rgb(self, x): - assert self.image_key == "segmentation" - if not hasattr(self, "colorize"): - self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x)) - x = F.conv2d(x, weight=self.colorize) - x = 2.*(x-x.min())/(x.max()-x.min()) - 1. - return x - - -class IdentityFirstStage(torch.nn.Module): - def __init__(self, *args, vq_interface=False, **kwargs): - self.vq_interface = vq_interface - super().__init__() - - def encode(self, x, *args, **kwargs): - return x - - def decode(self, x, *args, **kwargs): - return x - - def quantize(self, x, *args, **kwargs): - if self.vq_interface: - return x, None, [None, None, None] - return x - - def forward(self, x, *args, **kwargs): - return x - diff --git a/ControlNet/ldm/models/diffusion/__init__.py b/ControlNet/ldm/models/diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/models/diffusion/ddim.py b/ControlNet/ldm/models/diffusion/ddim.py deleted file mode 100644 index f1ca6bb1f4d03b58cf20c96fa715750950b19adc..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/ddim.py +++ /dev/null @@ -1,336 +0,0 @@ -"""SAMPLING ONLY.""" - -import torch -import numpy as np -from tqdm import tqdm - -from ControlNet.ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor - - -class DDIMSampler(object): - def __init__(self, model, schedule="linear", **kwargs): - super().__init__() - self.model = model - self.ddpm_num_timesteps = model.num_timesteps - self.schedule = schedule - - def register_buffer(self, name, attr): - if type(attr) == torch.Tensor: - if attr.device != torch.device("cuda"): - attr = attr.to(torch.device("cuda")) - setattr(self, name, attr) - - def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): - self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, - num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose) - alphas_cumprod = self.model.alphas_cumprod - assert alphas_cumprod.shape[0] == 
self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' - to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) - - self.register_buffer('betas', to_torch(self.model.betas)) - self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) - self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) - self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) - self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) - - # ddim sampling parameters - ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), - ddim_timesteps=self.ddim_timesteps, - eta=ddim_eta,verbose=verbose) - self.register_buffer('ddim_sigmas', ddim_sigmas) - self.register_buffer('ddim_alphas', ddim_alphas) - self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) - self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas)) - sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( - (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( - 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) - self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) - - @torch.no_grad() - def sample(self, - S, - batch_size, - shape, - conditioning=None, - callback=None, - normals_sequence=None, - img_callback=None, - quantize_x0=False, - eta=0., - mask=None, - x0=None, - temperature=1., - noise_dropout=0., - score_corrector=None, - corrector_kwargs=None, - verbose=True, - x_T=None, - log_every_t=100, - unconditional_guidance_scale=1., - unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
- dynamic_threshold=None, - ucg_schedule=None, - **kwargs - ): - if conditioning is not None: - if isinstance(conditioning, dict): - ctmp = conditioning[list(conditioning.keys())[0]] - while isinstance(ctmp, list): ctmp = ctmp[0] - cbs = ctmp.shape[0] - if cbs != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - - elif isinstance(conditioning, list): - for ctmp in conditioning: - if ctmp.shape[0] != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - - else: - if conditioning.shape[0] != batch_size: - print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") - - self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) - # sampling - C, H, W = shape - size = (batch_size, C, H, W) - print(f'Data shape for DDIM sampling is {size}, eta {eta}') - - samples, intermediates = self.ddim_sampling(conditioning, size, - callback=callback, - img_callback=img_callback, - quantize_denoised=quantize_x0, - mask=mask, x0=x0, - ddim_use_original_steps=False, - noise_dropout=noise_dropout, - temperature=temperature, - score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - x_T=x_T, - log_every_t=log_every_t, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - dynamic_threshold=dynamic_threshold, - ucg_schedule=ucg_schedule - ) - return samples, intermediates - - @torch.no_grad() - def ddim_sampling(self, cond, shape, - x_T=None, ddim_use_original_steps=False, - callback=None, timesteps=None, quantize_denoised=False, - mask=None, x0=None, img_callback=None, log_every_t=100, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None, - ucg_schedule=None): - device = self.model.betas.device - b = shape[0] - if x_T is None: - img = torch.randn(shape, device=device) - else: - img = x_T - - if timesteps is None: - timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps - elif timesteps is not None and not ddim_use_original_steps: - subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 - timesteps = self.ddim_timesteps[:subset_end] - - intermediates = {'x_inter': [img], 'pred_x0': [img]} - time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps) - total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] - print(f"Running DDIM Sampling with {total_steps} timesteps") - - iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) - - for i, step in enumerate(iterator): - index = total_steps - i - 1 - ts = torch.full((b,), step, device=device, dtype=torch.long) - - if mask is not None: - assert x0 is not None - img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? - img = img_orig * mask + (1. 
- mask) * img - - if ucg_schedule is not None: - assert len(ucg_schedule) == len(time_range) - unconditional_guidance_scale = ucg_schedule[i] - - outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, - quantize_denoised=quantize_denoised, temperature=temperature, - noise_dropout=noise_dropout, score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - dynamic_threshold=dynamic_threshold) - img, pred_x0 = outs - if callback: callback(i) - if img_callback: img_callback(pred_x0, i) - - if index % log_every_t == 0 or index == total_steps - 1: - intermediates['x_inter'].append(img) - intermediates['pred_x0'].append(pred_x0) - - return img, intermediates - - @torch.no_grad() - def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, - dynamic_threshold=None): - b, *_, device = *x.shape, x.device - - if unconditional_conditioning is None or unconditional_guidance_scale == 1.: - model_output = self.model.apply_model(x, t, c) - else: - x_in = torch.cat([x] * 2) - t_in = torch.cat([t] * 2) - if isinstance(c, dict): - assert isinstance(unconditional_conditioning, dict) - c_in = dict() - for k in c: - if isinstance(c[k], list): - c_in[k] = [torch.cat([ - unconditional_conditioning[k][i], - c[k][i]]) for i in range(len(c[k]))] - else: - c_in[k] = torch.cat([ - unconditional_conditioning[k], - c[k]]) - elif isinstance(c, list): - c_in = list() - assert isinstance(unconditional_conditioning, list) - for i in range(len(c)): - c_in.append(torch.cat([unconditional_conditioning[i], c[i]])) - else: - c_in = torch.cat([unconditional_conditioning, c]) - model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) - model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond) - - if self.model.parameterization == "v": - e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) - else: - e_t = model_output - - if score_corrector is not None: - assert self.model.parameterization == "eps", 'not implemented' - e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) - - alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas - alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev - sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas - sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas - # select parameters corresponding to the currently considered timestep - a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) - a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) - sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) - sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device) - - # current prediction for x_0 - if self.model.parameterization != "v": - pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() - else: - pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) - - if quantize_denoised: - pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) - - if dynamic_threshold is not None: - raise 
NotImplementedError() - - # direction pointing to x_t - dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t - noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature - if noise_dropout > 0.: - noise = torch.nn.functional.dropout(noise, p=noise_dropout) - x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise - return x_prev, pred_x0 - - @torch.no_grad() - def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None, - unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None): - num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0] - - assert t_enc <= num_reference_steps - num_steps = t_enc - - if use_original_steps: - alphas_next = self.alphas_cumprod[:num_steps] - alphas = self.alphas_cumprod_prev[:num_steps] - else: - alphas_next = self.ddim_alphas[:num_steps] - alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) - - x_next = x0 - intermediates = [] - inter_steps = [] - for i in tqdm(range(num_steps), desc='Encoding Image'): - t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long) - if unconditional_guidance_scale == 1.: - noise_pred = self.model.apply_model(x_next, t, c) - else: - assert unconditional_conditioning is not None - e_t_uncond, noise_pred = torch.chunk( - self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)), - torch.cat((unconditional_conditioning, c))), 2) - noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond) - - xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next - weighted_noise_pred = alphas_next[i].sqrt() * ( - (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred - x_next = xt_weighted + weighted_noise_pred - if return_intermediates and i % ( - num_steps // return_intermediates) == 0 and i < num_steps - 1: - intermediates.append(x_next) - inter_steps.append(i) - elif return_intermediates and i >= num_steps - 2: - intermediates.append(x_next) - inter_steps.append(i) - if callback: callback(i) - - out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} - if return_intermediates: - out.update({'intermediates': intermediates}) - return x_next, out - - @torch.no_grad() - def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): - # fast, but does not allow for exact reconstruction - # t serves as an index to gather the correct alphas - if use_original_steps: - sqrt_alphas_cumprod = self.sqrt_alphas_cumprod - sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod - else: - sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) - sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas - - if noise is None: - noise = torch.randn_like(x0) - return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + - extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise) - - @torch.no_grad() - def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None, - use_original_steps=False, callback=None): - - timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps - timesteps = timesteps[:t_start] - - time_range = np.flip(timesteps) - total_steps = timesteps.shape[0] - print(f"Running DDIM Sampling with {total_steps} timesteps") - - iterator = tqdm(time_range, desc='Decoding image', total=total_steps) - x_dec = x_latent - for i, step in enumerate(iterator): - index = total_steps - i - 1 - ts = torch.full((x_latent.shape[0],), step, 
device=x_latent.device, dtype=torch.long) - x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning) - if callback: callback(i) - return x_dec \ No newline at end of file diff --git a/ControlNet/ldm/models/diffusion/ddpm.py b/ControlNet/ldm/models/diffusion/ddpm.py deleted file mode 100644 index 1314be59ed451eb16053670e6e9d568fa674a124..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/ddpm.py +++ /dev/null @@ -1,1797 +0,0 @@ -""" -wild mixture of -https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py -https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py -https://github.com/CompVis/taming-transformers --- merci -""" - -import torch -import torch.nn as nn -import numpy as np -import pytorch_lightning as pl -from torch.optim.lr_scheduler import LambdaLR -from einops import rearrange, repeat -from contextlib import contextmanager, nullcontext -from functools import partial -import itertools -from tqdm import tqdm -from torchvision.utils import make_grid -from pytorch_lightning.utilities.distributed import rank_zero_only -from omegaconf import ListConfig - -from ControlNet.ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config -from ControlNet.ldm.modules.ema import LitEma -from ControlNet.ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution -from ControlNet.ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL -from ControlNet.ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like -from ControlNet.ldm.models.diffusion.ddim import DDIMSampler - - -__conditioning_keys__ = {'concat': 'c_concat', - 'crossattn': 'c_crossattn', - 'adm': 'y'} - - -def disabled_train(self, mode=True): - """Overwrite model.train with this function to make sure train/eval mode - does not change anymore.""" - return self - - -def uniform_on_device(r1, r2, shape, device): - return (r1 - r2) * torch.rand(*shape, device=device) + r2 - - -class DDPM(pl.LightningModule): - # classic DDPM with Gaussian diffusion, in image space - def __init__(self, - unet_config, - timesteps=1000, - beta_schedule="linear", - loss_type="l2", - ckpt_path=None, - ignore_keys=[], - load_only_unet=False, - monitor="val/loss", - use_ema=True, - first_stage_key="image", - image_size=256, - channels=3, - log_every_t=100, - clip_denoised=True, - linear_start=1e-4, - linear_end=2e-2, - cosine_s=8e-3, - given_betas=None, - original_elbo_weight=0., - v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta - l_simple_weight=1., - conditioning_key=None, - parameterization="eps", # all assuming fixed variance schedules - scheduler_config=None, - use_positional_encodings=False, - learn_logvar=False, - logvar_init=0., - make_it_fit=False, - ucg_training=None, - reset_ema=False, - reset_num_ema_updates=False, - ): - super().__init__() - assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"' - self.parameterization = parameterization - print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode") - self.cond_stage_model = None - 
self.clip_denoised = clip_denoised - self.log_every_t = log_every_t - self.first_stage_key = first_stage_key - self.image_size = image_size # try conv? - self.channels = channels - self.use_positional_encodings = use_positional_encodings - self.model = DiffusionWrapper(unet_config, conditioning_key) - count_params(self.model, verbose=True) - self.use_ema = use_ema - if self.use_ema: - self.model_ema = LitEma(self.model) - print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.") - - self.use_scheduler = scheduler_config is not None - if self.use_scheduler: - self.scheduler_config = scheduler_config - - self.v_posterior = v_posterior - self.original_elbo_weight = original_elbo_weight - self.l_simple_weight = l_simple_weight - - if monitor is not None: - self.monitor = monitor - self.make_it_fit = make_it_fit - if reset_ema: assert exists(ckpt_path) - if ckpt_path is not None: - self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet) - if reset_ema: - assert self.use_ema - print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.") - self.model_ema = LitEma(self.model) - if reset_num_ema_updates: - print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ") - assert self.use_ema - self.model_ema.reset_num_updates() - - self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps, - linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s) - - self.loss_type = loss_type - - self.learn_logvar = learn_logvar - logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,)) - if self.learn_logvar: - self.logvar = nn.Parameter(self.logvar, requires_grad=True) - else: - self.register_buffer('logvar', logvar) - - self.ucg_training = ucg_training or dict() - if self.ucg_training: - self.ucg_prng = np.random.RandomState() - - def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000, - linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): - if exists(given_betas): - betas = given_betas - else: - betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, - cosine_s=cosine_s) - alphas = 1. - betas - alphas_cumprod = np.cumprod(alphas, axis=0) - alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) - - timesteps, = betas.shape - self.num_timesteps = int(timesteps) - self.linear_start = linear_start - self.linear_end = linear_end - assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' - - to_torch = partial(torch.tensor, dtype=torch.float32) - - self.register_buffer('betas', to_torch(betas)) - self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) - self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) - self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) - self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) - self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) - self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) - - # calculations for posterior q(x_{t-1} | x_t, x_0) - posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / ( - 1. 
- alphas_cumprod) + self.v_posterior * betas - # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) - self.register_buffer('posterior_variance', to_torch(posterior_variance)) - # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain - self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20)))) - self.register_buffer('posterior_mean_coef1', to_torch( - betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))) - self.register_buffer('posterior_mean_coef2', to_torch( - (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod))) - - if self.parameterization == "eps": - lvlb_weights = self.betas ** 2 / ( - 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)) - elif self.parameterization == "x0": - lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod)) - elif self.parameterization == "v": - lvlb_weights = torch.ones_like(self.betas ** 2 / ( - 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))) - else: - raise NotImplementedError("mu not supported") - lvlb_weights[0] = lvlb_weights[1] - self.register_buffer('lvlb_weights', lvlb_weights, persistent=False) - assert not torch.isnan(self.lvlb_weights).all() - - @contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.model.parameters()) - self.model_ema.copy_to(self.model) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.model.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - @torch.no_grad() - def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): - sd = torch.load(path, map_location="cpu") - if "state_dict" in list(sd.keys()): - sd = sd["state_dict"] - keys = list(sd.keys()) - for k in keys: - for ik in ignore_keys: - if k.startswith(ik): - print("Deleting key {} from state_dict.".format(k)) - del sd[k] - if self.make_it_fit: - n_params = len([name for name, _ in - itertools.chain(self.named_parameters(), - self.named_buffers())]) - for name, param in tqdm( - itertools.chain(self.named_parameters(), - self.named_buffers()), - desc="Fitting old weights to new weights", - total=n_params - ): - if not name in sd: - continue - old_shape = sd[name].shape - new_shape = param.shape - assert len(old_shape) == len(new_shape) - if len(new_shape) > 2: - # we only modify first two axes - assert new_shape[2:] == old_shape[2:] - # assumes first axis corresponds to output dim - if not new_shape == old_shape: - new_param = param.clone() - old_param = sd[name] - if len(new_shape) == 1: - for i in range(new_param.shape[0]): - new_param[i] = old_param[i % old_shape[0]] - elif len(new_shape) >= 2: - for i in range(new_param.shape[0]): - for j in range(new_param.shape[1]): - new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]] - - n_used_old = torch.ones(old_shape[1]) - for j in range(new_param.shape[1]): - n_used_old[j % old_shape[1]] += 1 - n_used_new = torch.zeros(new_shape[1]) - for j in range(new_param.shape[1]): - n_used_new[j] = n_used_old[j % old_shape[1]] - - n_used_new = n_used_new[None, :] - while len(n_used_new.shape) < len(new_shape): - n_used_new = n_used_new.unsqueeze(-1) - new_param /= n_used_new - - sd[name] = new_param - - missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else 
self.model.load_state_dict( - sd, strict=False) - print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys") - if len(missing) > 0: - print(f"Missing Keys:\n {missing}") - if len(unexpected) > 0: - print(f"\nUnexpected Keys:\n {unexpected}") - - def q_mean_variance(self, x_start, t): - """ - Get the distribution q(x_t | x_0). - :param x_start: the [N x C x ...] tensor of noiseless inputs. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :return: A tuple (mean, variance, log_variance), all of x_start's shape. - """ - mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start) - variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) - log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) - return mean, variance, log_variance - - def predict_start_from_noise(self, x_t, t, noise): - return ( - extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise - ) - - def predict_start_from_z_and_v(self, x_t, t, v): - # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) - # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) - return ( - extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t - - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v - ) - - def predict_eps_from_z_and_v(self, x_t, t, v): - return ( - extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t - ) - - def q_posterior(self, x_start, x_t, t): - posterior_mean = ( - extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + - extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t - ) - posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) - posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) - return posterior_mean, posterior_variance, posterior_log_variance_clipped - - def p_mean_variance(self, x, t, clip_denoised: bool): - model_out = self.model(x, t) - if self.parameterization == "eps": - x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) - elif self.parameterization == "x0": - x_recon = model_out - if clip_denoised: - x_recon.clamp_(-1., 1.) 
- - model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) - return model_mean, posterior_variance, posterior_log_variance - - @torch.no_grad() - def p_sample(self, x, t, clip_denoised=True, repeat_noise=False): - b, *_, device = *x.shape, x.device - model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised) - noise = noise_like(x.shape, device, repeat_noise) - # no noise when t == 0 - nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) - return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise - - @torch.no_grad() - def p_sample_loop(self, shape, return_intermediates=False): - device = self.betas.device - b = shape[0] - img = torch.randn(shape, device=device) - intermediates = [img] - for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps): - img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long), - clip_denoised=self.clip_denoised) - if i % self.log_every_t == 0 or i == self.num_timesteps - 1: - intermediates.append(img) - if return_intermediates: - return img, intermediates - return img - - @torch.no_grad() - def sample(self, batch_size=16, return_intermediates=False): - image_size = self.image_size - channels = self.channels - return self.p_sample_loop((batch_size, channels, image_size, image_size), - return_intermediates=return_intermediates) - - def q_sample(self, x_start, t, noise=None): - noise = default(noise, lambda: torch.randn_like(x_start)) - return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) - - def get_v(self, x, noise, t): - return ( - extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise - - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x - ) - - def get_loss(self, pred, target, mean=True): - if self.loss_type == 'l1': - loss = (target - pred).abs() - if mean: - loss = loss.mean() - elif self.loss_type == 'l2': - if mean: - loss = torch.nn.functional.mse_loss(target, pred) - else: - loss = torch.nn.functional.mse_loss(target, pred, reduction='none') - else: - raise NotImplementedError("unknown loss type '{loss_type}'") - - return loss - - def p_losses(self, x_start, t, noise=None): - noise = default(noise, lambda: torch.randn_like(x_start)) - x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) - model_out = self.model(x_noisy, t) - - loss_dict = {} - if self.parameterization == "eps": - target = noise - elif self.parameterization == "x0": - target = x_start - elif self.parameterization == "v": - target = self.get_v(x_start, noise, t) - else: - raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported") - - loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3]) - - log_prefix = 'train' if self.training else 'val' - - loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()}) - loss_simple = loss.mean() * self.l_simple_weight - - loss_vlb = (self.lvlb_weights[t] * loss).mean() - loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb}) - - loss = loss_simple + self.original_elbo_weight * loss_vlb - - loss_dict.update({f'{log_prefix}/loss': loss}) - - return loss, loss_dict - - def forward(self, x, *args, **kwargs): - # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size - # assert h == img_size and w == img_size, f'height and width of image must be {img_size}' - t 
= torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long() - return self.p_losses(x, t, *args, **kwargs) - - def get_input(self, batch, k): - x = batch[k] - if len(x.shape) == 3: - x = x[..., None] - x = rearrange(x, 'b h w c -> b c h w') - x = x.to(memory_format=torch.contiguous_format).float() - return x - - def shared_step(self, batch): - x = self.get_input(batch, self.first_stage_key) - loss, loss_dict = self(x) - return loss, loss_dict - - def training_step(self, batch, batch_idx): - for k in self.ucg_training: - p = self.ucg_training[k]["p"] - val = self.ucg_training[k]["val"] - if val is None: - val = "" - for i in range(len(batch[k])): - if self.ucg_prng.choice(2, p=[1 - p, p]): - batch[k][i] = val - - loss, loss_dict = self.shared_step(batch) - - self.log_dict(loss_dict, prog_bar=True, - logger=True, on_step=True, on_epoch=True) - - self.log("global_step", self.global_step, - prog_bar=True, logger=True, on_step=True, on_epoch=False) - - if self.use_scheduler: - lr = self.optimizers().param_groups[0]['lr'] - self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False) - - return loss - - @torch.no_grad() - def validation_step(self, batch, batch_idx): - _, loss_dict_no_ema = self.shared_step(batch) - with self.ema_scope(): - _, loss_dict_ema = self.shared_step(batch) - loss_dict_ema = {key + '_ema': loss_dict_ema[key] for key in loss_dict_ema} - self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True) - self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True) - - def on_train_batch_end(self, *args, **kwargs): - if self.use_ema: - self.model_ema(self.model) - - def _get_rows_from_list(self, samples): - n_imgs_per_row = len(samples) - denoise_grid = rearrange(samples, 'n b c h w -> b n c h w') - denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') - denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) - return denoise_grid - - @torch.no_grad() - def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs): - log = dict() - x = self.get_input(batch, self.first_stage_key) - N = min(x.shape[0], N) - n_row = min(x.shape[0], n_row) - x = x.to(self.device)[:N] - log["inputs"] = x - - # get diffusion row - diffusion_row = list() - x_start = x[:n_row] - - for t in range(self.num_timesteps): - if t % self.log_every_t == 0 or t == self.num_timesteps - 1: - t = repeat(torch.tensor([t]), '1 -> b', b=n_row) - t = t.to(self.device).long() - noise = torch.randn_like(x_start) - x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) - diffusion_row.append(x_noisy) - - log["diffusion_row"] = self._get_rows_from_list(diffusion_row) - - if sample: - # get denoise row - with self.ema_scope("Plotting"): - samples, denoise_row = self.sample(batch_size=N, return_intermediates=True) - - log["samples"] = samples - log["denoise_row"] = self._get_rows_from_list(denoise_row) - - if return_keys: - if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: - return log - else: - return {key: log[key] for key in return_keys} - return log - - def configure_optimizers(self): - lr = self.learning_rate - params = list(self.model.parameters()) - if self.learn_logvar: - params = params + [self.logvar] - opt = torch.optim.AdamW(params, lr=lr) - return opt - - -class LatentDiffusion(DDPM): - """main class""" - - def __init__(self, - first_stage_config, - cond_stage_config, - num_timesteps_cond=None, - cond_stage_key="image", - cond_stage_trainable=False, - 
concat_mode=True, - cond_stage_forward=None, - conditioning_key=None, - scale_factor=1.0, - scale_by_std=False, - force_null_conditioning=False, - *args, **kwargs): - self.force_null_conditioning = force_null_conditioning - self.num_timesteps_cond = default(num_timesteps_cond, 1) - self.scale_by_std = scale_by_std - assert self.num_timesteps_cond <= kwargs['timesteps'] - # for backwards compatibility after implementation of DiffusionWrapper - if conditioning_key is None: - conditioning_key = 'concat' if concat_mode else 'crossattn' - if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning: - conditioning_key = None - ckpt_path = kwargs.pop("ckpt_path", None) - reset_ema = kwargs.pop("reset_ema", False) - reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False) - ignore_keys = kwargs.pop("ignore_keys", []) - super().__init__(conditioning_key=conditioning_key, *args, **kwargs) - self.concat_mode = concat_mode - self.cond_stage_trainable = cond_stage_trainable - self.cond_stage_key = cond_stage_key - try: - self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1 - except: - self.num_downs = 0 - if not scale_by_std: - self.scale_factor = scale_factor - else: - self.register_buffer('scale_factor', torch.tensor(scale_factor)) - self.instantiate_first_stage(first_stage_config) - self.instantiate_cond_stage(cond_stage_config) - self.cond_stage_forward = cond_stage_forward - self.clip_denoised = False - self.bbox_tokenizer = None - - self.restarted_from_ckpt = False - if ckpt_path is not None: - self.init_from_ckpt(ckpt_path, ignore_keys) - self.restarted_from_ckpt = True - if reset_ema: - assert self.use_ema - print( - f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.") - self.model_ema = LitEma(self.model) - if reset_num_ema_updates: - print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ") - assert self.use_ema - self.model_ema.reset_num_updates() - - def make_cond_schedule(self, ): - self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long) - ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long() - self.cond_ids[:self.num_timesteps_cond] = ids - - @rank_zero_only - @torch.no_grad() - def on_train_batch_start(self, batch, batch_idx, dataloader_idx): - # only for very first batch - if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt: - assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously' - # set rescale weight to 1./std of encodings - print("### USING STD-RESCALING ###") - x = super().get_input(batch, self.first_stage_key) - x = x.to(self.device) - encoder_posterior = self.encode_first_stage(x) - z = self.get_first_stage_encoding(encoder_posterior).detach() - del self.scale_factor - self.register_buffer('scale_factor', 1. 
/ z.flatten().std()) - print(f"setting self.scale_factor to {self.scale_factor}") - print("### USING STD-RESCALING ###") - - def register_schedule(self, - given_betas=None, beta_schedule="linear", timesteps=1000, - linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): - super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s) - - self.shorten_cond_schedule = self.num_timesteps_cond > 1 - if self.shorten_cond_schedule: - self.make_cond_schedule() - - def instantiate_first_stage(self, config): - model = instantiate_from_config(config) - self.first_stage_model = model.eval() - self.first_stage_model.train = disabled_train - for param in self.first_stage_model.parameters(): - param.requires_grad = False - - def instantiate_cond_stage(self, config): - if not self.cond_stage_trainable: - if config == "__is_first_stage__": - print("Using first stage also as cond stage.") - self.cond_stage_model = self.first_stage_model - elif config == "__is_unconditional__": - print(f"Training {self.__class__.__name__} as an unconditional model.") - self.cond_stage_model = None - # self.be_unconditional = True - else: - model = instantiate_from_config(config) - self.cond_stage_model = model.eval() - self.cond_stage_model.train = disabled_train - for param in self.cond_stage_model.parameters(): - param.requires_grad = False - else: - assert config != '__is_first_stage__' - assert config != '__is_unconditional__' - model = instantiate_from_config(config) - self.cond_stage_model = model - - def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False): - denoise_row = [] - for zd in tqdm(samples, desc=desc): - denoise_row.append(self.decode_first_stage(zd.to(self.device), - force_not_quantize=force_no_decoder_quantization)) - n_imgs_per_row = len(denoise_row) - denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W - denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w') - denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') - denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) - return denoise_grid - - def get_first_stage_encoding(self, encoder_posterior): - if isinstance(encoder_posterior, DiagonalGaussianDistribution): - z = encoder_posterior.sample() - elif isinstance(encoder_posterior, torch.Tensor): - z = encoder_posterior - else: - raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented") - return self.scale_factor * z - - def get_learned_conditioning(self, c): - if self.cond_stage_forward is None: - if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode): - c = self.cond_stage_model.encode(c) - if isinstance(c, DiagonalGaussianDistribution): - c = c.mode() - else: - c = self.cond_stage_model(c) - else: - assert hasattr(self.cond_stage_model, self.cond_stage_forward) - c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) - return c - - def meshgrid(self, h, w): - y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1) - x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1) - - arr = torch.cat([y, x], dim=-1) - return arr - - def delta_border(self, h, w): - """ - :param h: height - :param w: width - :return: normalized distance to image border, - wtith min distance = 0 at border and max dist = 0.5 at image center - """ - lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2) - arr = self.meshgrid(h, w) / lower_right_corner - dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0] - dist_right_down = 
torch.min(1 - arr, dim=-1, keepdims=True)[0] - edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0] - return edge_dist - - def get_weighting(self, h, w, Ly, Lx, device): - weighting = self.delta_border(h, w) - weighting = torch.clip(weighting, self.split_input_params["clip_min_weight"], - self.split_input_params["clip_max_weight"], ) - weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device) - - if self.split_input_params["tie_braker"]: - L_weighting = self.delta_border(Ly, Lx) - L_weighting = torch.clip(L_weighting, - self.split_input_params["clip_min_tie_weight"], - self.split_input_params["clip_max_tie_weight"]) - - L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device) - weighting = weighting * L_weighting - return weighting - - def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo load once not every time, shorten code - """ - :param x: img of size (bs, c, h, w) - :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1]) - """ - bs, nc, h, w = x.shape - - # number of crops in image - Ly = (h - kernel_size[0]) // stride[0] + 1 - Lx = (w - kernel_size[1]) // stride[1] + 1 - - if uf == 1 and df == 1: - fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride) - unfold = torch.nn.Unfold(**fold_params) - - fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params) - - weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype) - normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap - weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx)) - - elif uf > 1 and df == 1: - fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride) - unfold = torch.nn.Unfold(**fold_params) - - fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf), - dilation=1, padding=0, - stride=(stride[0] * uf, stride[1] * uf)) - fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2) - - weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype) - normalization = fold(weighting).view(1, 1, h * uf, w * uf) # normalizes the overlap - weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx)) - - elif df > 1 and uf == 1: - fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride) - unfold = torch.nn.Unfold(**fold_params) - - fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[0] // df), - dilation=1, padding=0, - stride=(stride[0] // df, stride[1] // df)) - fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2) - - weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype) - normalization = fold(weighting).view(1, 1, h // df, w // df) # normalizes the overlap - weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx)) - - else: - raise NotImplementedError - - return fold, unfold, normalization, weighting - - @torch.no_grad() - def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False, - cond_key=None, return_original_cond=False, bs=None, return_x=False): - x = super().get_input(batch, k) - if bs is not None: - x = x[:bs] - x = x.to(self.device) - encoder_posterior = self.encode_first_stage(x) - z = self.get_first_stage_encoding(encoder_posterior).detach() - - if self.model.conditioning_key is not None and not self.force_null_conditioning: 
- if cond_key is None: - cond_key = self.cond_stage_key - if cond_key != self.first_stage_key: - if cond_key in ['caption', 'coordinates_bbox', "txt"]: - xc = batch[cond_key] - elif cond_key in ['class_label', 'cls']: - xc = batch - else: - xc = super().get_input(batch, cond_key).to(self.device) - else: - xc = x - if not self.cond_stage_trainable or force_c_encode: - if isinstance(xc, dict) or isinstance(xc, list): - c = self.get_learned_conditioning(xc) - else: - c = self.get_learned_conditioning(xc.to(self.device)) - else: - c = xc - if bs is not None: - c = c[:bs] - - if self.use_positional_encodings: - pos_x, pos_y = self.compute_latent_shifts(batch) - ckey = __conditioning_keys__[self.model.conditioning_key] - c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y} - - else: - c = None - xc = None - if self.use_positional_encodings: - pos_x, pos_y = self.compute_latent_shifts(batch) - c = {'pos_x': pos_x, 'pos_y': pos_y} - out = [z, c] - if return_first_stage_outputs: - xrec = self.decode_first_stage(z) - out.extend([x, xrec]) - if return_x: - out.extend([x]) - if return_original_cond: - out.append(xc) - return out - - @torch.no_grad() - def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False): - if predict_cids: - if z.dim() == 4: - z = torch.argmax(z.exp(), dim=1).long() - z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None) - z = rearrange(z, 'b h w c -> b c h w').contiguous() - - z = 1. / self.scale_factor * z - return self.first_stage_model.decode(z) - - @torch.no_grad() - def encode_first_stage(self, x): - return self.first_stage_model.encode(x) - - def shared_step(self, batch, **kwargs): - x, c = self.get_input(batch, self.first_stage_key) - loss = self(x, c) - return loss - - def forward(self, x, c, *args, **kwargs): - t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long() - if self.model.conditioning_key is not None: - assert c is not None - if self.cond_stage_trainable: - c = self.get_learned_conditioning(c) - if self.shorten_cond_schedule: # TODO: drop this option - tc = self.cond_ids[t].to(self.device) - c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float())) - return self.p_losses(x, c, t, *args, **kwargs) - - def apply_model(self, x_noisy, t, cond, return_ids=False): - if isinstance(cond, dict): - # hybrid case, cond is expected to be a dict - pass - else: - if not isinstance(cond, list): - cond = [cond] - key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn' - cond = {key: cond} - - x_recon = self.model(x_noisy, t, **cond) - - if isinstance(x_recon, tuple) and not return_ids: - return x_recon[0] - else: - return x_recon - - def _predict_eps_from_xstart(self, x_t, t, pred_xstart): - return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / \ - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) - - def _prior_bpd(self, x_start): - """ - Get the prior KL term for the variational lower-bound, measured in - bits-per-dim. - This term can't be optimized, as it only depends on the encoder. - :param x_start: the [N x C x ...] tensor of inputs. - :return: a batch of [N] KL values (in bits), one per batch element. 
- """ - batch_size = x_start.shape[0] - t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) - qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) - kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) - return mean_flat(kl_prior) / np.log(2.0) - - def p_losses(self, x_start, cond, t, noise=None): - noise = default(noise, lambda: torch.randn_like(x_start)) - x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) - model_output = self.apply_model(x_noisy, t, cond) - - loss_dict = {} - prefix = 'train' if self.training else 'val' - - if self.parameterization == "x0": - target = x_start - elif self.parameterization == "eps": - target = noise - elif self.parameterization == "v": - target = self.get_v(x_start, noise, t) - else: - raise NotImplementedError() - - loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3]) - loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()}) - - logvar_t = self.logvar[t].to(self.device) - loss = loss_simple / torch.exp(logvar_t) + logvar_t - # loss = loss_simple / torch.exp(self.logvar) + self.logvar - if self.learn_logvar: - loss_dict.update({f'{prefix}/loss_gamma': loss.mean()}) - loss_dict.update({'logvar': self.logvar.data.mean()}) - - loss = self.l_simple_weight * loss.mean() - - loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3)) - loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean() - loss_dict.update({f'{prefix}/loss_vlb': loss_vlb}) - loss += (self.original_elbo_weight * loss_vlb) - loss_dict.update({f'{prefix}/loss': loss}) - - return loss, loss_dict - - def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False, - return_x0=False, score_corrector=None, corrector_kwargs=None): - t_in = t - model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids) - - if score_corrector is not None: - assert self.parameterization == "eps" - model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs) - - if return_codebook_ids: - model_out, logits = model_out - - if self.parameterization == "eps": - x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) - elif self.parameterization == "x0": - x_recon = model_out - else: - raise NotImplementedError() - - if clip_denoised: - x_recon.clamp_(-1., 1.) 
- if quantize_denoised: - x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon) - model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) - if return_codebook_ids: - return model_mean, posterior_variance, posterior_log_variance, logits - elif return_x0: - return model_mean, posterior_variance, posterior_log_variance, x_recon - else: - return model_mean, posterior_variance, posterior_log_variance - - @torch.no_grad() - def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False, - return_codebook_ids=False, quantize_denoised=False, return_x0=False, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None): - b, *_, device = *x.shape, x.device - outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised, - return_codebook_ids=return_codebook_ids, - quantize_denoised=quantize_denoised, - return_x0=return_x0, - score_corrector=score_corrector, corrector_kwargs=corrector_kwargs) - if return_codebook_ids: - raise DeprecationWarning("Support dropped.") - model_mean, _, model_log_variance, logits = outputs - elif return_x0: - model_mean, _, model_log_variance, x0 = outputs - else: - model_mean, _, model_log_variance = outputs - - noise = noise_like(x.shape, device, repeat_noise) * temperature - if noise_dropout > 0.: - noise = torch.nn.functional.dropout(noise, p=noise_dropout) - # no noise when t == 0 - nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) - - if return_codebook_ids: - return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1) - if return_x0: - return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0 - else: - return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise - - @torch.no_grad() - def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False, - img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0., - score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None, - log_every_t=None): - if not log_every_t: - log_every_t = self.log_every_t - timesteps = self.num_timesteps - if batch_size is not None: - b = batch_size if batch_size is not None else shape[0] - shape = [batch_size] + list(shape) - else: - b = batch_size = shape[0] - if x_T is None: - img = torch.randn(shape, device=self.device) - else: - img = x_T - intermediates = [] - if cond is not None: - if isinstance(cond, dict): - cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else - list(map(lambda x: x[:batch_size], cond[key])) for key in cond} - else: - cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size] - - if start_T is not None: - timesteps = min(timesteps, start_T) - iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation', - total=timesteps) if verbose else reversed( - range(0, timesteps)) - if type(temperature) == float: - temperature = [temperature] * timesteps - - for i in iterator: - ts = torch.full((b,), i, device=self.device, dtype=torch.long) - if self.shorten_cond_schedule: - assert self.model.conditioning_key != 'hybrid' - tc = self.cond_ids[ts].to(cond.device) - cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond)) - - img, x0_partial = self.p_sample(img, cond, ts, - clip_denoised=self.clip_denoised, - quantize_denoised=quantize_denoised, return_x0=True, - temperature=temperature[i], noise_dropout=noise_dropout, - 
score_corrector=score_corrector, corrector_kwargs=corrector_kwargs) - if mask is not None: - assert x0 is not None - img_orig = self.q_sample(x0, ts) - img = img_orig * mask + (1. - mask) * img - - if i % log_every_t == 0 or i == timesteps - 1: - intermediates.append(x0_partial) - if callback: callback(i) - if img_callback: img_callback(img, i) - return img, intermediates - - @torch.no_grad() - def p_sample_loop(self, cond, shape, return_intermediates=False, - x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False, - mask=None, x0=None, img_callback=None, start_T=None, - log_every_t=None): - - if not log_every_t: - log_every_t = self.log_every_t - device = self.betas.device - b = shape[0] - if x_T is None: - img = torch.randn(shape, device=device) - else: - img = x_T - - intermediates = [img] - if timesteps is None: - timesteps = self.num_timesteps - - if start_T is not None: - timesteps = min(timesteps, start_T) - iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed( - range(0, timesteps)) - - if mask is not None: - assert x0 is not None - assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match - - for i in iterator: - ts = torch.full((b,), i, device=device, dtype=torch.long) - if self.shorten_cond_schedule: - assert self.model.conditioning_key != 'hybrid' - tc = self.cond_ids[ts].to(cond.device) - cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond)) - - img = self.p_sample(img, cond, ts, - clip_denoised=self.clip_denoised, - quantize_denoised=quantize_denoised) - if mask is not None: - img_orig = self.q_sample(x0, ts) - img = img_orig * mask + (1. - mask) * img - - if i % log_every_t == 0 or i == timesteps - 1: - intermediates.append(img) - if callback: callback(i) - if img_callback: img_callback(img, i) - - if return_intermediates: - return img, intermediates - return img - - @torch.no_grad() - def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None, - verbose=True, timesteps=None, quantize_denoised=False, - mask=None, x0=None, shape=None, **kwargs): - if shape is None: - shape = (batch_size, self.channels, self.image_size, self.image_size) - if cond is not None: - if isinstance(cond, dict): - cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else - list(map(lambda x: x[:batch_size], cond[key])) for key in cond} - else: - cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size] - return self.p_sample_loop(cond, - shape, - return_intermediates=return_intermediates, x_T=x_T, - verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised, - mask=mask, x0=x0) - - @torch.no_grad() - def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): - if ddim: - ddim_sampler = DDIMSampler(self) - shape = (self.channels, self.image_size, self.image_size) - samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, - shape, cond, verbose=False, **kwargs) - - else: - samples, intermediates = self.sample(cond=cond, batch_size=batch_size, - return_intermediates=True, **kwargs) - - return samples, intermediates - - @torch.no_grad() - def get_unconditional_conditioning(self, batch_size, null_label=None): - if null_label is not None: - xc = null_label - if isinstance(xc, ListConfig): - xc = list(xc) - if isinstance(xc, dict) or isinstance(xc, list): - c = self.get_learned_conditioning(xc) - else: - if hasattr(xc, "to"): - xc = xc.to(self.device) - c = self.get_learned_conditioning(xc) - else: - if 
self.cond_stage_key in ["class_label", "cls"]: - xc = self.cond_stage_model.get_unconditional_conditioning(batch_size, device=self.device) - return self.get_learned_conditioning(xc) - else: - raise NotImplementedError("todo") - if isinstance(c, list): # in case the encoder gives us a list - for i in range(len(c)): - c[i] = repeat(c[i], '1 ... -> b ...', b=batch_size).to(self.device) - else: - c = repeat(c, '1 ... -> b ...', b=batch_size).to(self.device) - return c - - @torch.no_grad() - def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None, - quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True, - plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None, - use_ema_scope=True, - **kwargs): - ema_scope = self.ema_scope if use_ema_scope else nullcontext - use_ddim = ddim_steps is not None - - log = dict() - z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, - return_first_stage_outputs=True, - force_c_encode=True, - return_original_cond=True, - bs=N) - N = min(x.shape[0], N) - n_row = min(x.shape[0], n_row) - log["inputs"] = x - log["reconstruction"] = xrec - if self.model.conditioning_key is not None: - if hasattr(self.cond_stage_model, "decode"): - xc = self.cond_stage_model.decode(c) - log["conditioning"] = xc - elif self.cond_stage_key in ["caption", "txt"]: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25) - log["conditioning"] = xc - elif self.cond_stage_key in ['class_label', "cls"]: - try: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25) - log['conditioning'] = xc - except KeyError: - # probably no "human_label" in batch - pass - elif isimage(xc): - log["conditioning"] = xc - if ismap(xc): - log["original_conditioning"] = self.to_rgb(xc) - - if plot_diffusion_rows: - # get diffusion row - diffusion_row = list() - z_start = z[:n_row] - for t in range(self.num_timesteps): - if t % self.log_every_t == 0 or t == self.num_timesteps - 1: - t = repeat(torch.tensor([t]), '1 -> b', b=n_row) - t = t.to(self.device).long() - noise = torch.randn_like(z_start) - z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) - diffusion_row.append(self.decode_first_stage(z_noisy)) - - diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W - diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') - diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w') - diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) - log["diffusion_row"] = diffusion_grid - - if sample: - # get denoise row - with ema_scope("Sampling"): - samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta) - # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) - x_samples = self.decode_first_stage(samples) - log["samples"] = x_samples - if plot_denoise_rows: - denoise_grid = self._get_denoise_row_from_list(z_denoise_row) - log["denoise_row"] = denoise_grid - - if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance( - self.first_stage_model, IdentityFirstStage): - # also display when quantizing x0 while sampling - with ema_scope("Plotting Quantized Denoised"): - samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta, - quantize_denoised=True) - # samples, 
z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True, - # quantize_denoised=True) - x_samples = self.decode_first_stage(samples.to(self.device)) - log["samples_x0_quantized"] = x_samples - - if unconditional_guidance_scale > 1.0: - uc = self.get_unconditional_conditioning(N, unconditional_guidance_label) - if self.model.conditioning_key == "crossattn-adm": - uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]} - with ema_scope("Sampling with classifier-free guidance"): - samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=uc, - ) - x_samples_cfg = self.decode_first_stage(samples_cfg) - log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg - - if inpaint: - # make a simple center square - b, h, w = z.shape[0], z.shape[2], z.shape[3] - mask = torch.ones(N, h, w).to(self.device) - # zeros will be filled in - mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0. - mask = mask[:, None, ...] - with ema_scope("Plotting Inpaint"): - samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta, - ddim_steps=ddim_steps, x0=z[:N], mask=mask) - x_samples = self.decode_first_stage(samples.to(self.device)) - log["samples_inpainting"] = x_samples - log["mask"] = mask - - # outpaint - mask = 1. - mask - with ema_scope("Plotting Outpaint"): - samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta, - ddim_steps=ddim_steps, x0=z[:N], mask=mask) - x_samples = self.decode_first_stage(samples.to(self.device)) - log["samples_outpainting"] = x_samples - - if plot_progressive_rows: - with ema_scope("Plotting Progressives"): - img, progressives = self.progressive_denoising(c, - shape=(self.channels, self.image_size, self.image_size), - batch_size=N) - prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation") - log["progressive_row"] = prog_row - - if return_keys: - if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: - return log - else: - return {key: log[key] for key in return_keys} - return log - - def configure_optimizers(self): - lr = self.learning_rate - params = list(self.model.parameters()) - if self.cond_stage_trainable: - print(f"{self.__class__.__name__}: Also optimizing conditioner params!") - params = params + list(self.cond_stage_model.parameters()) - if self.learn_logvar: - print('Diffusion model optimizing logvar') - params.append(self.logvar) - opt = torch.optim.AdamW(params, lr=lr) - if self.use_scheduler: - assert 'target' in self.scheduler_config - scheduler = instantiate_from_config(self.scheduler_config) - - print("Setting up LambdaLR scheduler...") - scheduler = [ - { - 'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), - 'interval': 'step', - 'frequency': 1 - }] - return [opt], scheduler - return opt - - @torch.no_grad() - def to_rgb(self, x): - x = x.float() - if not hasattr(self, "colorize"): - self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x) - x = nn.functional.conv2d(x, weight=self.colorize) - x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. 
- return x - - -class DiffusionWrapper(pl.LightningModule): - def __init__(self, diff_model_config, conditioning_key): - super().__init__() - self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False) - self.diffusion_model = instantiate_from_config(diff_model_config) - self.conditioning_key = conditioning_key - assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm'] - - def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None): - if self.conditioning_key is None: - out = self.diffusion_model(x, t) - elif self.conditioning_key == 'concat': - xc = torch.cat([x] + c_concat, dim=1) - out = self.diffusion_model(xc, t) - elif self.conditioning_key == 'crossattn': - if not self.sequential_cross_attn: - cc = torch.cat(c_crossattn, 1) - else: - cc = c_crossattn - out = self.diffusion_model(x, t, context=cc) - elif self.conditioning_key == 'hybrid': - xc = torch.cat([x] + c_concat, dim=1) - cc = torch.cat(c_crossattn, 1) - out = self.diffusion_model(xc, t, context=cc) - elif self.conditioning_key == 'hybrid-adm': - assert c_adm is not None - xc = torch.cat([x] + c_concat, dim=1) - cc = torch.cat(c_crossattn, 1) - out = self.diffusion_model(xc, t, context=cc, y=c_adm) - elif self.conditioning_key == 'crossattn-adm': - assert c_adm is not None - cc = torch.cat(c_crossattn, 1) - out = self.diffusion_model(x, t, context=cc, y=c_adm) - elif self.conditioning_key == 'adm': - cc = c_crossattn[0] - out = self.diffusion_model(x, t, y=cc) - else: - raise NotImplementedError() - - return out - - -class LatentUpscaleDiffusion(LatentDiffusion): - def __init__(self, *args, low_scale_config, low_scale_key="LR", noise_level_key=None, **kwargs): - super().__init__(*args, **kwargs) - # assumes that neither the cond_stage nor the low_scale_model contain trainable params - assert not self.cond_stage_trainable - self.instantiate_low_stage(low_scale_config) - self.low_scale_key = low_scale_key - self.noise_level_key = noise_level_key - - def instantiate_low_stage(self, config): - model = instantiate_from_config(config) - self.low_scale_model = model.eval() - self.low_scale_model.train = disabled_train - for param in self.low_scale_model.parameters(): - param.requires_grad = False - - @torch.no_grad() - def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False): - if not log_mode: - z, c = super().get_input(batch, k, force_c_encode=True, bs=bs) - else: - z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True, - force_c_encode=True, return_original_cond=True, bs=bs) - x_low = batch[self.low_scale_key][:bs] - x_low = rearrange(x_low, 'b h w c -> b c h w') - x_low = x_low.to(memory_format=torch.contiguous_format).float() - zx, noise_level = self.low_scale_model(x_low) - if self.noise_level_key is not None: - # get noise level from batch instead, e.g. 
when extracting a custom noise level for bsr - raise NotImplementedError('TODO') - - all_conds = {"c_concat": [zx], "c_crossattn": [c], "c_adm": noise_level} - if log_mode: - # TODO: maybe disable if too expensive - x_low_rec = self.low_scale_model.decode(zx) - return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level - return z, all_conds - - @torch.no_grad() - def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None, - plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True, - unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True, - **kwargs): - ema_scope = self.ema_scope if use_ema_scope else nullcontext - use_ddim = ddim_steps is not None - - log = dict() - z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch, self.first_stage_key, bs=N, - log_mode=True) - N = min(x.shape[0], N) - n_row = min(x.shape[0], n_row) - log["inputs"] = x - log["reconstruction"] = xrec - log["x_lr"] = x_low - log[f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}"] = x_low_rec - if self.model.conditioning_key is not None: - if hasattr(self.cond_stage_model, "decode"): - xc = self.cond_stage_model.decode(c) - log["conditioning"] = xc - elif self.cond_stage_key in ["caption", "txt"]: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25) - log["conditioning"] = xc - elif self.cond_stage_key in ['class_label', 'cls']: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25) - log['conditioning'] = xc - elif isimage(xc): - log["conditioning"] = xc - if ismap(xc): - log["original_conditioning"] = self.to_rgb(xc) - - if plot_diffusion_rows: - # get diffusion row - diffusion_row = list() - z_start = z[:n_row] - for t in range(self.num_timesteps): - if t % self.log_every_t == 0 or t == self.num_timesteps - 1: - t = repeat(torch.tensor([t]), '1 -> b', b=n_row) - t = t.to(self.device).long() - noise = torch.randn_like(z_start) - z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) - diffusion_row.append(self.decode_first_stage(z_noisy)) - - diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W - diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') - diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w') - diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) - log["diffusion_row"] = diffusion_grid - - if sample: - # get denoise row - with ema_scope("Sampling"): - samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta) - # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) - x_samples = self.decode_first_stage(samples) - log["samples"] = x_samples - if plot_denoise_rows: - denoise_grid = self._get_denoise_row_from_list(z_denoise_row) - log["denoise_row"] = denoise_grid - - if unconditional_guidance_scale > 1.0: - uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label) - # TODO explore better "unconditional" choices for the other keys - # maybe guide away from empty text label and highest noise level and maximally degraded zx? - uc = dict() - for k in c: - if k == "c_crossattn": - assert isinstance(c[k], list) and len(c[k]) == 1 - uc[k] = [uc_tmp] - elif k == "c_adm": # todo: only run with text-based guidance? 
- assert isinstance(c[k], torch.Tensor) - #uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level - uc[k] = c[k] - elif isinstance(c[k], list): - uc[k] = [c[k][i] for i in range(len(c[k]))] - else: - uc[k] = c[k] - - with ema_scope("Sampling with classifier-free guidance"): - samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=uc, - ) - x_samples_cfg = self.decode_first_stage(samples_cfg) - log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg - - if plot_progressive_rows: - with ema_scope("Plotting Progressives"): - img, progressives = self.progressive_denoising(c, - shape=(self.channels, self.image_size, self.image_size), - batch_size=N) - prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation") - log["progressive_row"] = prog_row - - return log - - -class LatentFinetuneDiffusion(LatentDiffusion): - """ - Basis for different finetunas, such as inpainting or depth2image - To disable finetuning mode, set finetune_keys to None - """ - - def __init__(self, - concat_keys: tuple, - finetune_keys=("model.diffusion_model.input_blocks.0.0.weight", - "model_ema.diffusion_modelinput_blocks00weight" - ), - keep_finetune_dims=4, - # if model was trained without concat mode before and we would like to keep these channels - c_concat_log_start=None, # to log reconstruction of c_concat codes - c_concat_log_end=None, - *args, **kwargs - ): - ckpt_path = kwargs.pop("ckpt_path", None) - ignore_keys = kwargs.pop("ignore_keys", list()) - super().__init__(*args, **kwargs) - self.finetune_keys = finetune_keys - self.concat_keys = concat_keys - self.keep_dims = keep_finetune_dims - self.c_concat_log_start = c_concat_log_start - self.c_concat_log_end = c_concat_log_end - if exists(self.finetune_keys): assert exists(ckpt_path), 'can only finetune from a given checkpoint' - if exists(ckpt_path): - self.init_from_ckpt(ckpt_path, ignore_keys) - - def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): - sd = torch.load(path, map_location="cpu") - if "state_dict" in list(sd.keys()): - sd = sd["state_dict"] - keys = list(sd.keys()) - for k in keys: - for ik in ignore_keys: - if k.startswith(ik): - print("Deleting key {} from state_dict.".format(k)) - del sd[k] - - # make it explicit, finetune by including extra input channels - if exists(self.finetune_keys) and k in self.finetune_keys: - new_entry = None - for name, param in self.named_parameters(): - if name in self.finetune_keys: - print( - f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only") - new_entry = torch.zeros_like(param) # zero init - assert exists(new_entry), 'did not find matching parameter to modify' - new_entry[:, :self.keep_dims, ...] 
= sd[k] - sd[k] = new_entry - - missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict( - sd, strict=False) - print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys") - if len(missing) > 0: - print(f"Missing Keys: {missing}") - if len(unexpected) > 0: - print(f"Unexpected Keys: {unexpected}") - - @torch.no_grad() - def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None, - quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True, - plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None, - use_ema_scope=True, - **kwargs): - ema_scope = self.ema_scope if use_ema_scope else nullcontext - use_ddim = ddim_steps is not None - - log = dict() - z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True) - c_cat, c = c["c_concat"][0], c["c_crossattn"][0] - N = min(x.shape[0], N) - n_row = min(x.shape[0], n_row) - log["inputs"] = x - log["reconstruction"] = xrec - if self.model.conditioning_key is not None: - if hasattr(self.cond_stage_model, "decode"): - xc = self.cond_stage_model.decode(c) - log["conditioning"] = xc - elif self.cond_stage_key in ["caption", "txt"]: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25) - log["conditioning"] = xc - elif self.cond_stage_key in ['class_label', 'cls']: - xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25) - log['conditioning'] = xc - elif isimage(xc): - log["conditioning"] = xc - if ismap(xc): - log["original_conditioning"] = self.to_rgb(xc) - - if not (self.c_concat_log_start is None and self.c_concat_log_end is None): - log["c_concat_decoded"] = self.decode_first_stage(c_cat[:, self.c_concat_log_start:self.c_concat_log_end]) - - if plot_diffusion_rows: - # get diffusion row - diffusion_row = list() - z_start = z[:n_row] - for t in range(self.num_timesteps): - if t % self.log_every_t == 0 or t == self.num_timesteps - 1: - t = repeat(torch.tensor([t]), '1 -> b', b=n_row) - t = t.to(self.device).long() - noise = torch.randn_like(z_start) - z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) - diffusion_row.append(self.decode_first_stage(z_noisy)) - - diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W - diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') - diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w') - diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) - log["diffusion_row"] = diffusion_grid - - if sample: - # get denoise row - with ema_scope("Sampling"): - samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]}, - batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta) - # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) - x_samples = self.decode_first_stage(samples) - log["samples"] = x_samples - if plot_denoise_rows: - denoise_grid = self._get_denoise_row_from_list(z_denoise_row) - log["denoise_row"] = denoise_grid - - if unconditional_guidance_scale > 1.0: - uc_cross = self.get_unconditional_conditioning(N, unconditional_guidance_label) - uc_cat = c_cat - uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]} - with ema_scope("Sampling with classifier-free guidance"): - samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": 
[c]}, - batch_size=N, ddim=use_ddim, - ddim_steps=ddim_steps, eta=ddim_eta, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=uc_full, - ) - x_samples_cfg = self.decode_first_stage(samples_cfg) - log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg - - return log - - -class LatentInpaintDiffusion(LatentFinetuneDiffusion): - """ - can either run as pure inpainting model (only concat mode) or with mixed conditionings, - e.g. mask as concat and text via cross-attn. - To disable finetuning mode, set finetune_keys to None - """ - - def __init__(self, - concat_keys=("mask", "masked_image"), - masked_image_key="masked_image", - *args, **kwargs - ): - super().__init__(concat_keys, *args, **kwargs) - self.masked_image_key = masked_image_key - assert self.masked_image_key in concat_keys - - @torch.no_grad() - def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False): - # note: restricted to non-trainable encoders currently - assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting' - z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True, - force_c_encode=True, return_original_cond=True, bs=bs) - - assert exists(self.concat_keys) - c_cat = list() - for ck in self.concat_keys: - cc = rearrange(batch[ck], 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float() - if bs is not None: - cc = cc[:bs] - cc = cc.to(self.device) - bchw = z.shape - if ck != self.masked_image_key: - cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) - else: - cc = self.get_first_stage_encoding(self.encode_first_stage(cc)) - c_cat.append(cc) - c_cat = torch.cat(c_cat, dim=1) - all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} - if return_first_stage_outputs: - return z, all_conds, x, xrec, xc - return z, all_conds - - @torch.no_grad() - def log_images(self, *args, **kwargs): - log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs) - log["masked_image"] = rearrange(args[0]["masked_image"], - 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float() - return log - - -class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): - """ - condition on monocular depth estimation - """ - - def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs): - super().__init__(concat_keys=concat_keys, *args, **kwargs) - self.depth_model = instantiate_from_config(depth_stage_config) - self.depth_stage_key = concat_keys[0] - - @torch.no_grad() - def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False): - # note: restricted to non-trainable encoders currently - assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img' - z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True, - force_c_encode=True, return_original_cond=True, bs=bs) - - assert exists(self.concat_keys) - assert len(self.concat_keys) == 1 - c_cat = list() - for ck in self.concat_keys: - cc = batch[ck] - if bs is not None: - cc = cc[:bs] - cc = cc.to(self.device) - cc = self.depth_model(cc) - cc = torch.nn.functional.interpolate( - cc, - size=z.shape[2:], - mode="bicubic", - align_corners=False, - ) - - depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3], - keepdim=True) - cc = 2. * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1. 
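As a minimal standalone sketch of the depth-conditioning path above (tensor shapes and the `depth_batch` name are assumptions for illustration, not part of the original file): the depth prediction is resized to the latent resolution and min-max normalised per sample into [-1, 1] before being concatenated along the channel axis.

import torch
import torch.nn.functional as F

def depth_to_concat_cond(depth, latent_hw):
    # depth: (B, 1, H, W) depth prediction; latent_hw: spatial size (h, w) of the latent z
    depth = F.interpolate(depth, size=latent_hw, mode="bicubic", align_corners=False)
    d_min = torch.amin(depth, dim=[1, 2, 3], keepdim=True)
    d_max = torch.amax(depth, dim=[1, 2, 3], keepdim=True)
    # the small constant (0.001, as in the code above) avoids division by zero for a constant depth map
    return 2. * (depth - d_min) / (d_max - d_min + 0.001) - 1.

# e.g. depth_batch = torch.rand(2, 1, 512, 512); c_cat = depth_to_concat_cond(depth_batch, (64, 64))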
- c_cat.append(cc) - c_cat = torch.cat(c_cat, dim=1) - all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} - if return_first_stage_outputs: - return z, all_conds, x, xrec, xc - return z, all_conds - - @torch.no_grad() - def log_images(self, *args, **kwargs): - log = super().log_images(*args, **kwargs) - depth = self.depth_model(args[0][self.depth_stage_key]) - depth_min, depth_max = torch.amin(depth, dim=[1, 2, 3], keepdim=True), \ - torch.amax(depth, dim=[1, 2, 3], keepdim=True) - log["depth"] = 2. * (depth - depth_min) / (depth_max - depth_min) - 1. - return log - - -class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): - """ - condition on low-res image (and optionally on some spatial noise augmentation) - """ - def __init__(self, concat_keys=("lr",), reshuffle_patch_size=None, - low_scale_config=None, low_scale_key=None, *args, **kwargs): - super().__init__(concat_keys=concat_keys, *args, **kwargs) - self.reshuffle_patch_size = reshuffle_patch_size - self.low_scale_model = None - if low_scale_config is not None: - print("Initializing a low-scale model") - assert exists(low_scale_key) - self.instantiate_low_stage(low_scale_config) - self.low_scale_key = low_scale_key - - def instantiate_low_stage(self, config): - model = instantiate_from_config(config) - self.low_scale_model = model.eval() - self.low_scale_model.train = disabled_train - for param in self.low_scale_model.parameters(): - param.requires_grad = False - - @torch.no_grad() - def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False): - # note: restricted to non-trainable encoders currently - assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft' - z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True, - force_c_encode=True, return_original_cond=True, bs=bs) - - assert exists(self.concat_keys) - assert len(self.concat_keys) == 1 - # optionally make spatial noise_level here - c_cat = list() - noise_level = None - for ck in self.concat_keys: - cc = batch[ck] - cc = rearrange(cc, 'b h w c -> b c h w') - if exists(self.reshuffle_patch_size): - assert isinstance(self.reshuffle_patch_size, int) - cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w', - p1=self.reshuffle_patch_size, p2=self.reshuffle_patch_size) - if bs is not None: - cc = cc[:bs] - cc = cc.to(self.device) - if exists(self.low_scale_model) and ck == self.low_scale_key: - cc, noise_level = self.low_scale_model(cc) - c_cat.append(cc) - c_cat = torch.cat(c_cat, dim=1) - if exists(noise_level): - all_conds = {"c_concat": [c_cat], "c_crossattn": [c], "c_adm": noise_level} - else: - all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} - if return_first_stage_outputs: - return z, all_conds, x, xrec, xc - return z, all_conds - - @torch.no_grad() - def log_images(self, *args, **kwargs): - log = super().log_images(*args, **kwargs) - log["lr"] = rearrange(args[0]["lr"], 'b h w c -> b c h w') - return log diff --git a/ControlNet/ldm/models/diffusion/dpm_solver/__init__.py b/ControlNet/ldm/models/diffusion/dpm_solver/__init__.py deleted file mode 100644 index 7427f38c07530afbab79154ea8aaf88c4bf70a08..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/dpm_solver/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .sampler import DPMSolverSampler \ No newline at end of file diff --git a/ControlNet/ldm/models/diffusion/dpm_solver/dpm_solver.py b/ControlNet/ldm/models/diffusion/dpm_solver/dpm_solver.py deleted file mode 100644 index 
095e5ba3ce0b1aa7f4b3f1e2e5d8fff7cfe6dc8c..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/dpm_solver/dpm_solver.py +++ /dev/null @@ -1,1154 +0,0 @@ -import torch -import torch.nn.functional as F -import math -from tqdm import tqdm - - -class NoiseScheduleVP: - def __init__( - self, - schedule='discrete', - betas=None, - alphas_cumprod=None, - continuous_beta_0=0.1, - continuous_beta_1=20., - ): - """Create a wrapper class for the forward SDE (VP type). - *** - Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t. - We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images. - *** - The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ). - We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper). - Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have: - log_alpha_t = self.marginal_log_mean_coeff(t) - sigma_t = self.marginal_std(t) - lambda_t = self.marginal_lambda(t) - Moreover, as lambda(t) is an invertible function, we also support its inverse function: - t = self.inverse_lambda(lambda_t) - =============================================================== - We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]). - 1. For discrete-time DPMs: - For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by: - t_i = (i + 1) / N - e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1. - We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3. - Args: - betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details) - alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details) - Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`. - **Important**: Please pay special attention to the args for `alphas_cumprod`: - The `alphas_cumprod` is the \hat{alpha_n} array in the notations of DDPM. Specifically, DDPMs assume that - q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ). - Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have - alpha_{t_n} = \sqrt{\hat{alpha_n}}, - and - log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}). - 2. For continuous-time DPMs: - We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise - schedule are the default settings in DDPM and improved-DDPM: - Args: - beta_min: A `float` number. The smallest beta for the linear schedule. - beta_max: A `float` number. The largest beta for the linear schedule. - cosine_s: A `float` number. The hyperparameter in the cosine schedule. - cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule. - T: A `float` number. The ending time of the forward process. - =============================================================== - Args: - schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs, - 'linear' or 'cosine' for continuous-time DPMs. - Returns: - A wrapper object of the forward SDE (VP type).
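As a minimal sketch of the discrete-schedule relationships described above (the linear beta values are an assumed example, not taken from the original file):

import torch

N = 1000
betas = torch.linspace(1e-4, 2e-2, N)                 # hypothetical DDPM-style linear schedule
alphas_cumprod = torch.cumprod(1. - betas, dim=0)     # \hat{alpha}_n in DDPM notation
log_alpha = 0.5 * torch.log(alphas_cumprod)           # log(alpha_{t_n}) = 0.5 * log(\hat{alpha}_n)
t = torch.arange(1, N + 1) / N                        # discrete step n maps to t_n = (n + 1) / N
sigma = torch.sqrt(1. - torch.exp(2. * log_alpha))    # sigma_t = sqrt(1 - alpha_t^2)
lam = log_alpha - torch.log(sigma)                    # half-logSNR lambda_t = log(alpha_t) - log(sigma_t)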
- - =============================================================== - Example: - # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): - >>> ns = NoiseScheduleVP('discrete', betas=betas) - # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): - >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) - # For continuous-time DPMs (VPSDE), linear schedule: - >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) - """ - - if schedule not in ['discrete', 'linear', 'cosine']: - raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule)) - - self.schedule = schedule - if schedule == 'discrete': - if betas is not None: - log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) - else: - assert alphas_cumprod is not None - log_alphas = 0.5 * torch.log(alphas_cumprod) - self.total_N = len(log_alphas) - self.T = 1. - self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)) - self.log_alpha_array = log_alphas.reshape((1, -1,)) - else: - self.total_N = 1000 - self.beta_0 = continuous_beta_0 - self.beta_1 = continuous_beta_1 - self.cosine_s = 0.008 - self.cosine_beta_max = 999. - self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * ( - 1. + self.cosine_s) / math.pi - self.cosine_s - self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.)) - self.schedule = schedule - if schedule == 'cosine': - # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T. - # Note that T = 0.9946 may be not the optimal setting. However, we find it works well. - self.T = 0.9946 - else: - self.T = 1. - - def marginal_log_mean_coeff(self, t): - """ - Compute log(alpha_t) of a given continuous-time label t in [0, T]. - """ - if self.schedule == 'discrete': - return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), - self.log_alpha_array.to(t.device)).reshape((-1)) - elif self.schedule == 'linear': - return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 - elif self.schedule == 'cosine': - log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.)) - log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0 - return log_alpha_t - - def marginal_alpha(self, t): - """ - Compute alpha_t of a given continuous-time label t in [0, T]. - """ - return torch.exp(self.marginal_log_mean_coeff(t)) - - def marginal_std(self, t): - """ - Compute sigma_t of a given continuous-time label t in [0, T]. - """ - return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t))) - - def marginal_lambda(self, t): - """ - Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T]. - """ - log_mean_coeff = self.marginal_log_mean_coeff(t) - log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff)) - return log_mean_coeff - log_std - - def inverse_lambda(self, lamb): - """ - Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t. - """ - if self.schedule == 'linear': - tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) - Delta = self.beta_0 ** 2 + tmp - return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0) - elif self.schedule == 'discrete': - log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. 
* lamb) - t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), - torch.flip(self.t_array.to(lamb.device), [1])) - return t.reshape((-1,)) - else: - log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) - t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * ( - 1. + self.cosine_s) / math.pi - self.cosine_s - t = t_fn(log_alpha) - return t - - -def model_wrapper( - model, - noise_schedule, - model_type="noise", - model_kwargs={}, - guidance_type="uncond", - condition=None, - unconditional_condition=None, - guidance_scale=1., - classifier_fn=None, - classifier_kwargs={}, -): - """Create a wrapper function for the noise prediction model. - DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to - firstly wrap the model function to a noise prediction model that accepts the continuous time as the input. - We support four types of the diffusion model by setting `model_type`: - 1. "noise": noise prediction model. (Trained by predicting noise). - 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0). - 3. "v": velocity prediction model. (Trained by predicting the velocity). - The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2]. - [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models." - arXiv preprint arXiv:2202.00512 (2022). - [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models." - arXiv preprint arXiv:2210.02303 (2022). - - 4. "score": marginal score function. (Trained by denoising score matching). - Note that the score function and the noise prediction model follows a simple relationship: - ``` - noise(x_t, t) = -sigma_t * score(x_t, t) - ``` - We support three types of guided sampling by DPMs by setting `guidance_type`: - 1. "uncond": unconditional sampling by DPMs. - The input `model` has the following format: - `` - model(x, t_input, **model_kwargs) -> noise | x_start | v | score - `` - 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier. - The input `model` has the following format: - `` - model(x, t_input, **model_kwargs) -> noise | x_start | v | score - `` - The input `classifier_fn` has the following format: - `` - classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond) - `` - [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis," - in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794. - 3. "classifier-free": classifier-free guidance sampling by conditional DPMs. - The input `model` has the following format: - `` - model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score - `` - And if cond == `unconditional_condition`, the model output is the unconditional DPM output. - [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance." - arXiv preprint arXiv:2207.12598 (2022). - - The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999) - or continuous-time labels (i.e. epsilon to T). 
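The parameterisation conversions listed above reduce to a few lines; the following is a hedged standalone sketch (the helper name and the broadcast-ready alpha_t/sigma_t arguments are assumptions, not the file's own API):

def to_noise_prediction(output, x, alpha_t, sigma_t, model_type):
    # Convert a model output in any of the four parameterisations to an epsilon (noise) prediction.
    if model_type == "noise":
        return output
    if model_type == "x_start":   # x_t = alpha_t * x0 + sigma_t * eps  =>  eps = (x_t - alpha_t * x0) / sigma_t
        return (x - alpha_t * output) / sigma_t
    if model_type == "v":         # v = alpha_t * eps - sigma_t * x0    =>  eps = alpha_t * v + sigma_t * x_t
        return alpha_t * output + sigma_t * x
    if model_type == "score":     # score = -eps / sigma_t              =>  eps = -sigma_t * score
        return -sigma_t * output
    raise ValueError(f"unknown model_type {model_type}")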
- We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise: - `` - def model_fn(x, t_continuous) -> noise: - t_input = get_model_input_time(t_continuous) - return noise_pred(model, x, t_input, **model_kwargs) - `` - where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver. - =============================================================== - Args: - model: A diffusion model with the corresponding format described above. - noise_schedule: A noise schedule object, such as NoiseScheduleVP. - model_type: A `str`. The parameterization type of the diffusion model. - "noise" or "x_start" or "v" or "score". - model_kwargs: A `dict`. A dict for the other inputs of the model function. - guidance_type: A `str`. The type of the guidance for sampling. - "uncond" or "classifier" or "classifier-free". - condition: A pytorch tensor. The condition for the guided sampling. - Only used for "classifier" or "classifier-free" guidance type. - unconditional_condition: A pytorch tensor. The condition for the unconditional sampling. - Only used for "classifier-free" guidance type. - guidance_scale: A `float`. The scale for the guided sampling. - classifier_fn: A classifier function. Only used for the classifier guidance. - classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function. - Returns: - A noise prediction model that accepts the noised data and the continuous time as the inputs. - """ - - def get_model_input_time(t_continuous): - """ - Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time. - For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N]. - For continuous-time DPMs, we just use `t_continuous`. - """ - if noise_schedule.schedule == 'discrete': - return (t_continuous - 1. / noise_schedule.total_N) * 1000. - else: - return t_continuous - - def noise_pred_fn(x, t_continuous, cond=None): - if t_continuous.reshape((-1,)).shape[0] == 1: - t_continuous = t_continuous.expand((x.shape[0])) - t_input = get_model_input_time(t_continuous) - if cond is None: - output = model(x, t_input, **model_kwargs) - else: - output = model(x, t_input, cond, **model_kwargs) - if model_type == "noise": - return output - elif model_type == "x_start": - alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) - dims = x.dim() - return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims) - elif model_type == "v": - alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) - dims = x.dim() - return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x - elif model_type == "score": - sigma_t = noise_schedule.marginal_std(t_continuous) - dims = x.dim() - return -expand_dims(sigma_t, dims) * output - - def cond_grad_fn(x, t_input): - """ - Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t). - """ - with torch.enable_grad(): - x_in = x.detach().requires_grad_(True) - log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs) - return torch.autograd.grad(log_prob.sum(), x_in)[0] - - def model_fn(x, t_continuous): - """ - The noise predicition model function that is used for DPM-Solver. 
- """ - if t_continuous.reshape((-1,)).shape[0] == 1: - t_continuous = t_continuous.expand((x.shape[0])) - if guidance_type == "uncond": - return noise_pred_fn(x, t_continuous) - elif guidance_type == "classifier": - assert classifier_fn is not None - t_input = get_model_input_time(t_continuous) - cond_grad = cond_grad_fn(x, t_input) - sigma_t = noise_schedule.marginal_std(t_continuous) - noise = noise_pred_fn(x, t_continuous) - return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad - elif guidance_type == "classifier-free": - if guidance_scale == 1. or unconditional_condition is None: - return noise_pred_fn(x, t_continuous, cond=condition) - else: - x_in = torch.cat([x] * 2) - t_in = torch.cat([t_continuous] * 2) - c_in = torch.cat([unconditional_condition, condition]) - noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) - return noise_uncond + guidance_scale * (noise - noise_uncond) - - assert model_type in ["noise", "x_start", "v"] - assert guidance_type in ["uncond", "classifier", "classifier-free"] - return model_fn - - -class DPM_Solver: - def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.): - """Construct a DPM-Solver. - We support both the noise prediction model ("predicting epsilon") and the data prediction model ("predicting x0"). - If `predict_x0` is False, we use the solver for the noise prediction model (DPM-Solver). - If `predict_x0` is True, we use the solver for the data prediction model (DPM-Solver++). - In such case, we further support the "dynamic thresholding" in [1] when `thresholding` is True. - The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales. - Args: - model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]): - `` - def model_fn(x, t_continuous): - return noise - `` - noise_schedule: A noise schedule object, such as NoiseScheduleVP. - predict_x0: A `bool`. If true, use the data prediction model; else, use the noise prediction model. - thresholding: A `bool`. Valid when `predict_x0` is True. Whether to use the "dynamic thresholding" in [1]. - max_val: A `float`. Valid when both `predict_x0` and `thresholding` are True. The max value for thresholding. - - [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. - """ - self.model = model_fn - self.noise_schedule = noise_schedule - self.predict_x0 = predict_x0 - self.thresholding = thresholding - self.max_val = max_val - - def noise_prediction_fn(self, x, t): - """ - Return the noise prediction model. - """ - return self.model(x, t) - - def data_prediction_fn(self, x, t): - """ - Return the data prediction model (with thresholding). - """ - noise = self.noise_prediction_fn(x, t) - dims = x.dim() - alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) - x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims) - if self.thresholding: - p = 0.995 # A hyperparameter in the paper of "Imagen" [1]. 
- s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) - s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims) - x0 = torch.clamp(x0, -s, s) / s - return x0 - - def model_fn(self, x, t): - """ - Convert the model to the noise prediction model or the data prediction model. - """ - if self.predict_x0: - return self.data_prediction_fn(x, t) - else: - return self.noise_prediction_fn(x, t) - - def get_time_steps(self, skip_type, t_T, t_0, N, device): - """Compute the intermediate time steps for sampling. - Args: - skip_type: A `str`. The type for the spacing of the time steps. We support three types: - - 'logSNR': uniform logSNR for the time steps. - - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) - - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) - t_T: A `float`. The starting time of the sampling (default is T). - t_0: A `float`. The ending time of the sampling (default is epsilon). - N: A `int`. The total number of the spacing of the time steps. - device: A torch device. - Returns: - A pytorch tensor of the time steps, with the shape (N + 1,). - """ - if skip_type == 'logSNR': - lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device)) - lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device)) - logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device) - return self.noise_schedule.inverse_lambda(logSNR_steps) - elif skip_type == 'time_uniform': - return torch.linspace(t_T, t_0, N + 1).to(device) - elif skip_type == 'time_quadratic': - t_order = 2 - t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device) - return t - else: - raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)) - - def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): - """ - Get the order of each step for sampling by the singlestep DPM-Solver. - We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast". - Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is: - - If order == 1: - We take `steps` of DPM-Solver-1 (i.e. DDIM). - - If order == 2: - - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling. - - If steps % 2 == 0, we use K steps of DPM-Solver-2. - - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1. - - If order == 3: - - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. - - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1. - - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1. - - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2. - ============================================ - Args: - order: A `int`. The max order for the solver (2 or 3). - steps: A `int`. The total number of function evaluations (NFE). - skip_type: A `str`. The type for the spacing of the time steps. We support three types: - - 'logSNR': uniform logSNR for the time steps. - - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) - - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) 
- t_T: A `float`. The starting time of the sampling (default is T). - t_0: A `float`. The ending time of the sampling (default is epsilon). - device: A torch device. - Returns: - orders: A list of the solver order of each step. - """ - if order == 3: - K = steps // 3 + 1 - if steps % 3 == 0: - orders = [3, ] * (K - 2) + [2, 1] - elif steps % 3 == 1: - orders = [3, ] * (K - 1) + [1] - else: - orders = [3, ] * (K - 1) + [2] - elif order == 2: - if steps % 2 == 0: - K = steps // 2 - orders = [2, ] * K - else: - K = steps // 2 + 1 - orders = [2, ] * (K - 1) + [1] - elif order == 1: - K = 1 - orders = [1, ] * steps - else: - raise ValueError("'order' must be '1' or '2' or '3'.") - if skip_type == 'logSNR': - # To reproduce the results in DPM-Solver paper - timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) - else: - timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum(torch.tensor([0, ] + orders)).to(device)] - return timesteps_outer, orders - - def denoise_to_zero_fn(self, x, s): - """ - Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization. - """ - return self.data_prediction_fn(x, s) - - def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False): - """ - DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - s: A pytorch tensor. The starting time, with the shape (x.shape[0],). - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - model_s: A pytorch tensor. The model function evaluated at time `s`. - If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. - return_intermediate: A `bool`. If true, also return the model value at time `s`. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - ns = self.noise_schedule - dims = x.dim() - lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) - h = lambda_t - lambda_s - log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t) - sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t) - alpha_t = torch.exp(log_alpha_t) - - if self.predict_x0: - phi_1 = torch.expm1(-h) - if model_s is None: - model_s = self.model_fn(x, s) - x_t = ( - expand_dims(sigma_t / sigma_s, dims) * x - - expand_dims(alpha_t * phi_1, dims) * model_s - ) - if return_intermediate: - return x_t, {'model_s': model_s} - else: - return x_t - else: - phi_1 = torch.expm1(h) - if model_s is None: - model_s = self.model_fn(x, s) - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x - - expand_dims(sigma_t * phi_1, dims) * model_s - ) - if return_intermediate: - return x_t, {'model_s': model_s} - else: - return x_t - - def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, - solver_type='dpm_solver'): - """ - Singlestep solver DPM-Solver-2 from time `s` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - s: A pytorch tensor. The starting time, with the shape (x.shape[0],). - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - r1: A `float`. The hyperparameter of the second-order solver. - model_s: A pytorch tensor. The model function evaluated at time `s`. - If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. - return_intermediate: A `bool`. 
If true, also return the model value at time `s` and `s1` (the intermediate time). - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - if solver_type not in ['dpm_solver', 'taylor']: - raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) - if r1 is None: - r1 = 0.5 - ns = self.noise_schedule - dims = x.dim() - lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) - h = lambda_t - lambda_s - lambda_s1 = lambda_s + r1 * h - s1 = ns.inverse_lambda(lambda_s1) - log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff( - s1), ns.marginal_log_mean_coeff(t) - sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t) - alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t) - - if self.predict_x0: - phi_11 = torch.expm1(-r1 * h) - phi_1 = torch.expm1(-h) - - if model_s is None: - model_s = self.model_fn(x, s) - x_s1 = ( - expand_dims(sigma_s1 / sigma_s, dims) * x - - expand_dims(alpha_s1 * phi_11, dims) * model_s - ) - model_s1 = self.model_fn(x_s1, s1) - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(sigma_t / sigma_s, dims) * x - - expand_dims(alpha_t * phi_1, dims) * model_s - - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s) - ) - elif solver_type == 'taylor': - x_t = ( - expand_dims(sigma_t / sigma_s, dims) * x - - expand_dims(alpha_t * phi_1, dims) * model_s - + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * ( - model_s1 - model_s) - ) - else: - phi_11 = torch.expm1(r1 * h) - phi_1 = torch.expm1(h) - - if model_s is None: - model_s = self.model_fn(x, s) - x_s1 = ( - expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x - - expand_dims(sigma_s1 * phi_11, dims) * model_s - ) - model_s1 = self.model_fn(x_s1, s1) - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x - - expand_dims(sigma_t * phi_1, dims) * model_s - - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s) - ) - elif solver_type == 'taylor': - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x - - expand_dims(sigma_t * phi_1, dims) * model_s - - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s) - ) - if return_intermediate: - return x_t, {'model_s': model_s, 'model_s1': model_s1} - else: - return x_t - - def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None, - return_intermediate=False, solver_type='dpm_solver'): - """ - Singlestep solver DPM-Solver-3 from time `s` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - s: A pytorch tensor. The starting time, with the shape (x.shape[0],). - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - r1: A `float`. The hyperparameter of the third-order solver. - r2: A `float`. The hyperparameter of the third-order solver. - model_s: A pytorch tensor. The model function evaluated at time `s`. - If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. - model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`). 
- If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it. - return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times). - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - if solver_type not in ['dpm_solver', 'taylor']: - raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) - if r1 is None: - r1 = 1. / 3. - if r2 is None: - r2 = 2. / 3. - ns = self.noise_schedule - dims = x.dim() - lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) - h = lambda_t - lambda_s - lambda_s1 = lambda_s + r1 * h - lambda_s2 = lambda_s + r2 * h - s1 = ns.inverse_lambda(lambda_s1) - s2 = ns.inverse_lambda(lambda_s2) - log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff( - s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t) - sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std( - s2), ns.marginal_std(t) - alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t) - - if self.predict_x0: - phi_11 = torch.expm1(-r1 * h) - phi_12 = torch.expm1(-r2 * h) - phi_1 = torch.expm1(-h) - phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1. - phi_2 = phi_1 / h + 1. - phi_3 = phi_2 / h - 0.5 - - if model_s is None: - model_s = self.model_fn(x, s) - if model_s1 is None: - x_s1 = ( - expand_dims(sigma_s1 / sigma_s, dims) * x - - expand_dims(alpha_s1 * phi_11, dims) * model_s - ) - model_s1 = self.model_fn(x_s1, s1) - x_s2 = ( - expand_dims(sigma_s2 / sigma_s, dims) * x - - expand_dims(alpha_s2 * phi_12, dims) * model_s - + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s) - ) - model_s2 = self.model_fn(x_s2, s2) - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(sigma_t / sigma_s, dims) * x - - expand_dims(alpha_t * phi_1, dims) * model_s - + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s) - ) - elif solver_type == 'taylor': - D1_0 = (1. / r1) * (model_s1 - model_s) - D1_1 = (1. / r2) * (model_s2 - model_s) - D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) - D2 = 2. * (D1_1 - D1_0) / (r2 - r1) - x_t = ( - expand_dims(sigma_t / sigma_s, dims) * x - - expand_dims(alpha_t * phi_1, dims) * model_s - + expand_dims(alpha_t * phi_2, dims) * D1 - - expand_dims(alpha_t * phi_3, dims) * D2 - ) - else: - phi_11 = torch.expm1(r1 * h) - phi_12 = torch.expm1(r2 * h) - phi_1 = torch.expm1(h) - phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1. - phi_2 = phi_1 / h - 1. - phi_3 = phi_2 / h - 0.5 - - if model_s is None: - model_s = self.model_fn(x, s) - if model_s1 is None: - x_s1 = ( - expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x - - expand_dims(sigma_s1 * phi_11, dims) * model_s - ) - model_s1 = self.model_fn(x_s1, s1) - x_s2 = ( - expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x - - expand_dims(sigma_s2 * phi_12, dims) * model_s - - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s) - ) - model_s2 = self.model_fn(x_s2, s2) - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x - - expand_dims(sigma_t * phi_1, dims) * model_s - - (1. 
/ r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s) - ) - elif solver_type == 'taylor': - D1_0 = (1. / r1) * (model_s1 - model_s) - D1_1 = (1. / r2) * (model_s2 - model_s) - D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) - D2 = 2. * (D1_1 - D1_0) / (r2 - r1) - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x - - expand_dims(sigma_t * phi_1, dims) * model_s - - expand_dims(sigma_t * phi_2, dims) * D1 - - expand_dims(sigma_t * phi_3, dims) * D2 - ) - - if return_intermediate: - return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2} - else: - return x_t - - def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpm_solver"): - """ - Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - model_prev_list: A list of pytorch tensor. The previous computed model values. - t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],) - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - if solver_type not in ['dpm_solver', 'taylor']: - raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) - ns = self.noise_schedule - dims = x.dim() - model_prev_1, model_prev_0 = model_prev_list - t_prev_1, t_prev_0 = t_prev_list - lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda( - t_prev_0), ns.marginal_lambda(t) - log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) - sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) - alpha_t = torch.exp(log_alpha_t) - - h_0 = lambda_prev_0 - lambda_prev_1 - h = lambda_t - lambda_prev_0 - r0 = h_0 / h - D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1) - if self.predict_x0: - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(sigma_t / sigma_prev_0, dims) * x - - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 - - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0 - ) - elif solver_type == 'taylor': - x_t = ( - expand_dims(sigma_t / sigma_prev_0, dims) * x - - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 - + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0 - ) - else: - if solver_type == 'dpm_solver': - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x - - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 - - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0 - ) - elif solver_type == 'taylor': - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x - - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 - - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0 - ) - return x_t - - def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpm_solver'): - """ - Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - model_prev_list: A list of pytorch tensor. The previous computed model values. - t_prev_list: A list of pytorch tensor. 
The previous times, each time has the shape (x.shape[0],) - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - ns = self.noise_schedule - dims = x.dim() - model_prev_2, model_prev_1, model_prev_0 = model_prev_list - t_prev_2, t_prev_1, t_prev_0 = t_prev_list - lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda( - t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t) - log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) - sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) - alpha_t = torch.exp(log_alpha_t) - - h_1 = lambda_prev_1 - lambda_prev_2 - h_0 = lambda_prev_0 - lambda_prev_1 - h = lambda_t - lambda_prev_0 - r0, r1 = h_0 / h, h_1 / h - D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1) - D1_1 = expand_dims(1. / r1, dims) * (model_prev_1 - model_prev_2) - D1 = D1_0 + expand_dims(r0 / (r0 + r1), dims) * (D1_0 - D1_1) - D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1) - if self.predict_x0: - x_t = ( - expand_dims(sigma_t / sigma_prev_0, dims) * x - - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 - + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1 - - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2 - ) - else: - x_t = ( - expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x - - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 - - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1 - - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2 - ) - return x_t - - def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None, - r2=None): - """ - Singlestep DPM-Solver with the order `order` from time `s` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - s: A pytorch tensor. The starting time, with the shape (x.shape[0],). - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3. - return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times). - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - r1: A `float`. The hyperparameter of the second-order or third-order solver. - r2: A `float`. The hyperparameter of the third-order solver. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. 
- """ - if order == 1: - return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate) - elif order == 2: - return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, - solver_type=solver_type, r1=r1) - elif order == 3: - return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, - solver_type=solver_type, r1=r1, r2=r2) - else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) - - def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpm_solver'): - """ - Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`. - Args: - x: A pytorch tensor. The initial value at time `s`. - model_prev_list: A list of pytorch tensor. The previous computed model values. - t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],) - t: A pytorch tensor. The ending time, with the shape (x.shape[0],). - order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3. - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_t: A pytorch tensor. The approximated solution at time `t`. - """ - if order == 1: - return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1]) - elif order == 2: - return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) - elif order == 3: - return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) - else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) - - def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, - solver_type='dpm_solver'): - """ - The adaptive step size solver based on singlestep DPM-Solver. - Args: - x: A pytorch tensor. The initial value at time `t_T`. - order: A `int`. The (higher) order of the solver. We only support order == 2 or 3. - t_T: A `float`. The starting time of the sampling (default is T). - t_0: A `float`. The ending time of the sampling (default is epsilon). - h_init: A `float`. The initial step size (for logSNR). - atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1]. - rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05. - theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1]. - t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the - current time and `t_0` is less than `t_err`. The default setting is 1e-5. - solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. - The type slightly impacts the performance. We recommend to use 'dpm_solver' type. - Returns: - x_0: A pytorch tensor. The approximated solution at time `t_0`. - [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021. 
- """ - ns = self.noise_schedule - s = t_T * torch.ones((x.shape[0],)).to(x) - lambda_s = ns.marginal_lambda(s) - lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x)) - h = h_init * torch.ones_like(s).to(x) - x_prev = x - nfe = 0 - if order == 2: - r1 = 0.5 - lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True) - higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, - solver_type=solver_type, - **kwargs) - elif order == 3: - r1, r2 = 1. / 3., 2. / 3. - lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, - return_intermediate=True, - solver_type=solver_type) - higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, - solver_type=solver_type, - **kwargs) - else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: - t = ns.inverse_lambda(lambda_s + h) - x_lower, lower_noise_kwargs = lower_update(x, s, t) - x_higher = higher_update(x, s, t, **lower_noise_kwargs) - delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev))) - norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True)) - E = norm_fn((x_higher - x_lower) / delta).max() - if torch.all(E <= 1.): - x = x_higher - s = t - x_prev = x_lower - lambda_s = ns.marginal_lambda(s) - h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s) - nfe += order - print('adaptive solver nfe', nfe) - return x - - def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform', - method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver', - atol=0.0078, rtol=0.05, - ): - """ - Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`. - ===================================================== - We support the following algorithms for both noise prediction model and data prediction model: - - 'singlestep': - Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver. - We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps). - The total number of function evaluations (NFE) == `steps`. - Given a fixed NFE == `steps`, the sampling procedure is: - - If `order` == 1: - - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM). - - If `order` == 2: - - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling. - - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2. - - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. - - If `order` == 3: - - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. - - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. - - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1. - - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2. - - 'multistep': - Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`. - We initialize the first `order` values by lower order multistep solvers. 
- Given a fixed NFE == `steps`, the sampling procedure is: - Denote K = steps. - - If `order` == 1: - - We use K steps of DPM-Solver-1 (i.e. DDIM). - - If `order` == 2: - - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2. - - If `order` == 3: - - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3. - - 'singlestep_fixed': - Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3). - We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE. - - 'adaptive': - Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper). - We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`. - You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computatation costs - (NFE) and the sample quality. - - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2. - - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3. - ===================================================== - Some advices for choosing the algorithm: - - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs: - Use singlestep DPM-Solver ("DPM-Solver-fast" in the paper) with `order = 3`. - e.g. - >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=False) - >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3, - skip_type='time_uniform', method='singlestep') - - For **guided sampling with large guidance scale** by DPMs: - Use multistep DPM-Solver with `predict_x0 = True` and `order = 2`. - e.g. - >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True) - >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2, - skip_type='time_uniform', method='multistep') - We support three types of `skip_type`: - - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images** - - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**. - - 'time_quadratic': quadratic time for the time steps. - ===================================================== - Args: - x: A pytorch tensor. The initial value at time `t_start` - e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution. - steps: A `int`. The total number of function evaluations (NFE). - t_start: A `float`. The starting time of the sampling. - If `T` is None, we use self.noise_schedule.T (default is 1.0). - t_end: A `float`. The ending time of the sampling. - If `t_end` is None, we use 1. / self.noise_schedule.total_N. - e.g. if total_N == 1000, we have `t_end` == 1e-3. - For discrete-time DPMs: - - We recommend `t_end` == 1. / self.noise_schedule.total_N. - For continuous-time DPMs: - - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15. - order: A `int`. The order of DPM-Solver. - skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'. - method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'. - denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step. - Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1). 
- This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and - score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID - for diffusion models sampling by diffusion SDEs for low-resolutional images - (such as CIFAR-10). However, we observed that such trick does not matter for - high-resolutional images. As it needs an additional NFE, we do not recommend - it for high-resolutional images. - lower_order_final: A `bool`. Whether to use lower order solvers at the final steps. - Only valid for `method=multistep` and `steps < 15`. We empirically find that - this trick is a key to stabilizing the sampling by DPM-Solver with very few steps - (especially for steps <= 10). So we recommend to set it to be `True`. - solver_type: A `str`. The taylor expansion type for the solver. `dpm_solver` or `taylor`. We recommend `dpm_solver`. - atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. - rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. - Returns: - x_end: A pytorch tensor. The approximated solution at time `t_end`. - """ - t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end - t_T = self.noise_schedule.T if t_start is None else t_start - device = x.device - if method == 'adaptive': - with torch.no_grad(): - x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, - solver_type=solver_type) - elif method == 'multistep': - assert steps >= order - timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device) - assert timesteps.shape[0] - 1 == steps - with torch.no_grad(): - vec_t = timesteps[0].expand((x.shape[0])) - model_prev_list = [self.model_fn(x, vec_t)] - t_prev_list = [vec_t] - # Init the first `order` values by lower order multistep DPM-Solver. - for init_order in tqdm(range(1, order), desc="DPM init order"): - vec_t = timesteps[init_order].expand(x.shape[0]) - x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order, - solver_type=solver_type) - model_prev_list.append(self.model_fn(x, vec_t)) - t_prev_list.append(vec_t) - # Compute the remaining values by `order`-th order multistep DPM-Solver. - for step in tqdm(range(order, steps + 1), desc="DPM multistep"): - vec_t = timesteps[step].expand(x.shape[0]) - if lower_order_final and steps < 15: - step_order = min(order, steps + 1 - step) - else: - step_order = order - x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order, - solver_type=solver_type) - for i in range(order - 1): - t_prev_list[i] = t_prev_list[i + 1] - model_prev_list[i] = model_prev_list[i + 1] - t_prev_list[-1] = vec_t - # We do not need to evaluate the final model value. 
- if step < steps: - model_prev_list[-1] = self.model_fn(x, vec_t) - elif method in ['singlestep', 'singlestep_fixed']: - if method == 'singlestep': - timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, - skip_type=skip_type, - t_T=t_T, t_0=t_0, - device=device) - elif method == 'singlestep_fixed': - K = steps // order - orders = [order, ] * K - timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device) - for i, order in enumerate(orders): - t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1] - timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(), - N=order, device=device) - lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner) - vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0]) - h = lambda_inner[-1] - lambda_inner[0] - r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h - r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h - x = self.singlestep_dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2) - if denoise_to_zero: - x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0) - return x - - -############################################################# -# other utility functions -############################################################# - -def interpolate_fn(x, xp, yp): - """ - A piecewise linear function y = f(x), using xp and yp as keypoints. - We implement f(x) in a differentiable way (i.e. applicable for autograd). - The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.) - Args: - x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver). - xp: PyTorch tensor with shape [C, K], where K is the number of keypoints. - yp: PyTorch tensor with shape [C, K]. - Returns: - The function values f(x), with shape [N, C]. - """ - N, K = x.shape[0], xp.shape[1] - all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2) - sorted_all_x, x_indices = torch.sort(all_x, dim=2) - x_idx = torch.argmin(x_indices, dim=2) - cand_start_idx = x_idx - 1 - start_idx = torch.where( - torch.eq(x_idx, 0), - torch.tensor(1, device=x.device), - torch.where( - torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, - ), - ) - end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) - start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2) - end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2) - start_idx2 = torch.where( - torch.eq(x_idx, 0), - torch.tensor(0, device=x.device), - torch.where( - torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, - ), - ) - y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) - start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) - end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2) - cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) - return cand - - -def expand_dims(v, dims): - """ - Expand the tensor `v` to the dim `dims`. - Args: - `v`: a PyTorch tensor with shape [N]. - `dim`: a `int`. - Returns: - a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`. 
- """ - return v[(...,) + (None,) * (dims - 1)] \ No newline at end of file diff --git a/ControlNet/ldm/models/diffusion/dpm_solver/sampler.py b/ControlNet/ldm/models/diffusion/dpm_solver/sampler.py deleted file mode 100644 index 7d137b8cf36718c1c58faa09f9dd919e5fb2977b..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/dpm_solver/sampler.py +++ /dev/null @@ -1,87 +0,0 @@ -"""SAMPLING ONLY.""" -import torch - -from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver - - -MODEL_TYPES = { - "eps": "noise", - "v": "v" -} - - -class DPMSolverSampler(object): - def __init__(self, model, **kwargs): - super().__init__() - self.model = model - to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) - self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod)) - - def register_buffer(self, name, attr): - if type(attr) == torch.Tensor: - if attr.device != torch.device("cuda"): - attr = attr.to(torch.device("cuda")) - setattr(self, name, attr) - - @torch.no_grad() - def sample(self, - S, - batch_size, - shape, - conditioning=None, - callback=None, - normals_sequence=None, - img_callback=None, - quantize_x0=False, - eta=0., - mask=None, - x0=None, - temperature=1., - noise_dropout=0., - score_corrector=None, - corrector_kwargs=None, - verbose=True, - x_T=None, - log_every_t=100, - unconditional_guidance_scale=1., - unconditional_conditioning=None, - # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... - **kwargs - ): - if conditioning is not None: - if isinstance(conditioning, dict): - cbs = conditioning[list(conditioning.keys())[0]].shape[0] - if cbs != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - else: - if conditioning.shape[0] != batch_size: - print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") - - # sampling - C, H, W = shape - size = (batch_size, C, H, W) - - print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') - - device = self.model.betas.device - if x_T is None: - img = torch.randn(size, device=device) - else: - img = x_T - - ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) - - model_fn = model_wrapper( - lambda x, t, c: self.model.apply_model(x, t, c), - ns, - model_type=MODEL_TYPES[self.model.parameterization], - guidance_type="classifier-free", - condition=conditioning, - unconditional_condition=unconditional_conditioning, - guidance_scale=unconditional_guidance_scale, - ) - - dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) - x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True) - - return x.to(device), None \ No newline at end of file diff --git a/ControlNet/ldm/models/diffusion/plms.py b/ControlNet/ldm/models/diffusion/plms.py deleted file mode 100644 index 7002a365d27168ced0a04e9a4d83e088f8284eae..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/plms.py +++ /dev/null @@ -1,244 +0,0 @@ -"""SAMPLING ONLY.""" - -import torch -import numpy as np -from tqdm import tqdm -from functools import partial - -from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like -from ldm.models.diffusion.sampling_util import norm_thresholding - - -class PLMSSampler(object): - def __init__(self, model, schedule="linear", **kwargs): - super().__init__() - self.model = model - self.ddpm_num_timesteps = 
model.num_timesteps - self.schedule = schedule - - def register_buffer(self, name, attr): - if type(attr) == torch.Tensor: - if attr.device != torch.device("cuda"): - attr = attr.to(torch.device("cuda")) - setattr(self, name, attr) - - def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): - if ddim_eta != 0: - raise ValueError('ddim_eta must be 0 for PLMS') - self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, - num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose) - alphas_cumprod = self.model.alphas_cumprod - assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' - to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) - - self.register_buffer('betas', to_torch(self.model.betas)) - self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) - self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) - self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) - self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) - self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) - - # ddim sampling parameters - ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), - ddim_timesteps=self.ddim_timesteps, - eta=ddim_eta,verbose=verbose) - self.register_buffer('ddim_sigmas', ddim_sigmas) - self.register_buffer('ddim_alphas', ddim_alphas) - self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) - self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas)) - sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( - (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( - 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) - self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) - - @torch.no_grad() - def sample(self, - S, - batch_size, - shape, - conditioning=None, - callback=None, - normals_sequence=None, - img_callback=None, - quantize_x0=False, - eta=0., - mask=None, - x0=None, - temperature=1., - noise_dropout=0., - score_corrector=None, - corrector_kwargs=None, - verbose=True, - x_T=None, - log_every_t=100, - unconditional_guidance_scale=1., - unconditional_conditioning=None, - # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
- dynamic_threshold=None, - **kwargs - ): - if conditioning is not None: - if isinstance(conditioning, dict): - cbs = conditioning[list(conditioning.keys())[0]].shape[0] - if cbs != batch_size: - print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") - else: - if conditioning.shape[0] != batch_size: - print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") - - self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) - # sampling - C, H, W = shape - size = (batch_size, C, H, W) - print(f'Data shape for PLMS sampling is {size}') - - samples, intermediates = self.plms_sampling(conditioning, size, - callback=callback, - img_callback=img_callback, - quantize_denoised=quantize_x0, - mask=mask, x0=x0, - ddim_use_original_steps=False, - noise_dropout=noise_dropout, - temperature=temperature, - score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - x_T=x_T, - log_every_t=log_every_t, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - dynamic_threshold=dynamic_threshold, - ) - return samples, intermediates - - @torch.no_grad() - def plms_sampling(self, cond, shape, - x_T=None, ddim_use_original_steps=False, - callback=None, timesteps=None, quantize_denoised=False, - mask=None, x0=None, img_callback=None, log_every_t=100, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, - dynamic_threshold=None): - device = self.model.betas.device - b = shape[0] - if x_T is None: - img = torch.randn(shape, device=device) - else: - img = x_T - - if timesteps is None: - timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps - elif timesteps is not None and not ddim_use_original_steps: - subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 - timesteps = self.ddim_timesteps[:subset_end] - - intermediates = {'x_inter': [img], 'pred_x0': [img]} - time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps) - total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] - print(f"Running PLMS Sampling with {total_steps} timesteps") - - iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps) - old_eps = [] - - for i, step in enumerate(iterator): - index = total_steps - i - 1 - ts = torch.full((b,), step, device=device, dtype=torch.long) - ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long) - - if mask is not None: - assert x0 is not None - img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? - img = img_orig * mask + (1. 
- mask) * img - - outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, - quantize_denoised=quantize_denoised, temperature=temperature, - noise_dropout=noise_dropout, score_corrector=score_corrector, - corrector_kwargs=corrector_kwargs, - unconditional_guidance_scale=unconditional_guidance_scale, - unconditional_conditioning=unconditional_conditioning, - old_eps=old_eps, t_next=ts_next, - dynamic_threshold=dynamic_threshold) - img, pred_x0, e_t = outs - old_eps.append(e_t) - if len(old_eps) >= 4: - old_eps.pop(0) - if callback: callback(i) - if img_callback: img_callback(pred_x0, i) - - if index % log_every_t == 0 or index == total_steps - 1: - intermediates['x_inter'].append(img) - intermediates['pred_x0'].append(pred_x0) - - return img, intermediates - - @torch.no_grad() - def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, - temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, - unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, - dynamic_threshold=None): - b, *_, device = *x.shape, x.device - - def get_model_output(x, t): - if unconditional_conditioning is None or unconditional_guidance_scale == 1.: - e_t = self.model.apply_model(x, t, c) - else: - x_in = torch.cat([x] * 2) - t_in = torch.cat([t] * 2) - c_in = torch.cat([unconditional_conditioning, c]) - e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) - e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond) - - if score_corrector is not None: - assert self.model.parameterization == "eps" - e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) - - return e_t - - alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas - alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev - sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas - sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas - - def get_x_prev_and_pred_x0(e_t, index): - # select parameters corresponding to the currently considered timestep - a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) - a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) - sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) - sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device) - - # current prediction for x_0 - pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() - if quantize_denoised: - pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) - if dynamic_threshold is not None: - pred_x0 = norm_thresholding(pred_x0, dynamic_threshold) - # direction pointing to x_t - dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t - noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature - if noise_dropout > 0.: - noise = torch.nn.functional.dropout(noise, p=noise_dropout) - x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise - return x_prev, pred_x0 - - e_t = get_model_output(x, t) - if len(old_eps) == 0: - # Pseudo Improved Euler (2nd order) - x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) - e_t_next = get_model_output(x_prev, t_next) - e_t_prime = (e_t + e_t_next) / 2 - elif len(old_eps) == 1: - # 2nd order Pseudo Linear Multistep (Adams-Bashforth) - e_t_prime = (3 * e_t - old_eps[-1]) / 2 - elif len(old_eps) == 2: - # 3nd order Pseudo Linear Multistep (Adams-Bashforth) - e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 - elif len(old_eps) >= 3: - # 4nd order Pseudo Linear Multistep (Adams-Bashforth) - e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24 - - x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) - - return x_prev, pred_x0, e_t diff --git a/ControlNet/ldm/models/diffusion/sampling_util.py b/ControlNet/ldm/models/diffusion/sampling_util.py deleted file mode 100644 index 7eff02be6d7c54d43ee6680636ac0698dd3b3f33..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/models/diffusion/sampling_util.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -import numpy as np - - -def append_dims(x, target_dims): - """Appends dimensions to the end of a tensor until it has target_dims dimensions. - From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" - dims_to_append = target_dims - x.ndim - if dims_to_append < 0: - raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') - return x[(...,) + (None,) * dims_to_append] - - -def norm_thresholding(x0, value): - s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) - return x0 * (value / s) - - -def spatial_norm_thresholding(x0, value): - # b c h w - s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) - return x0 * (value / s) \ No newline at end of file diff --git a/ControlNet/ldm/modules/attention.py b/ControlNet/ldm/modules/attention.py deleted file mode 100644 index 152dacaacf95fc3149b0e89c6dcb8fa54cbe35df..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/attention.py +++ /dev/null @@ -1,341 +0,0 @@ -from inspect import isfunction -import math -import torch -import torch.nn.functional as F -from torch import nn, einsum -from einops import rearrange, repeat -from typing import Optional, Any - -from ControlNet.ldm.modules.diffusionmodules.util import checkpoint - - -try: - import xformers - import xformers.ops - XFORMERS_IS_AVAILBLE = True -except: - XFORMERS_IS_AVAILBLE = False - -# CrossAttn precision handling -import os -_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32") - -def exists(val): - return val is not None - - -def uniq(arr): - return{el: True for el in arr}.keys() - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -def max_neg_value(t): - return -torch.finfo(t.dtype).max - - -def init_(tensor): - dim = tensor.shape[-1] - std = 1 / math.sqrt(dim) - tensor.uniform_(-std, std) - return tensor - - -# feedforward -class GEGLU(nn.Module): - def __init__(self, dim_in, dim_out): - super().__init__() - self.proj = nn.Linear(dim_in, dim_out * 2) - - def forward(self, x): - x, gate = self.proj(x).chunk(2, dim=-1) - return x * F.gelu(gate) - - -class FeedForward(nn.Module): - def 
__init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): - super().__init__() - inner_dim = int(dim * mult) - dim_out = default(dim_out, dim) - project_in = nn.Sequential( - nn.Linear(dim, inner_dim), - nn.GELU() - ) if not glu else GEGLU(dim, inner_dim) - - self.net = nn.Sequential( - project_in, - nn.Dropout(dropout), - nn.Linear(inner_dim, dim_out) - ) - - def forward(self, x): - return self.net(x) - - -def zero_module(module): - """ - Zero out the parameters of a module and return it. - """ - for p in module.parameters(): - p.detach().zero_() - return module - - -def Normalize(in_channels): - return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) - - -class SpatialSelfAttention(nn.Module): - def __init__(self, in_channels): - super().__init__() - self.in_channels = in_channels - - self.norm = Normalize(in_channels) - self.q = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.k = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.v = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.proj_out = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - - def forward(self, x): - h_ = x - h_ = self.norm(h_) - q = self.q(h_) - k = self.k(h_) - v = self.v(h_) - - # compute attention - b,c,h,w = q.shape - q = rearrange(q, 'b c h w -> b (h w) c') - k = rearrange(k, 'b c h w -> b c (h w)') - w_ = torch.einsum('bij,bjk->bik', q, k) - - w_ = w_ * (int(c)**(-0.5)) - w_ = torch.nn.functional.softmax(w_, dim=2) - - # attend to values - v = rearrange(v, 'b c h w -> b c (h w)') - w_ = rearrange(w_, 'b i j -> b j i') - h_ = torch.einsum('bij,bjk->bik', v, w_) - h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) - h_ = self.proj_out(h_) - - return x+h_ - - -class CrossAttention(nn.Module): - def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.): - super().__init__() - inner_dim = dim_head * heads - context_dim = default(context_dim, query_dim) - - self.scale = dim_head ** -0.5 - self.heads = heads - - self.to_q = nn.Linear(query_dim, inner_dim, bias=False) - self.to_k = nn.Linear(context_dim, inner_dim, bias=False) - self.to_v = nn.Linear(context_dim, inner_dim, bias=False) - - self.to_out = nn.Sequential( - nn.Linear(inner_dim, query_dim), - nn.Dropout(dropout) - ) - - def forward(self, x, context=None, mask=None): - h = self.heads - - q = self.to_q(x) - context = default(context, x) - k = self.to_k(context) - v = self.to_v(context) - - q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) - - # force cast to fp32 to avoid overflowing - if _ATTN_PRECISION =="fp32": - with torch.autocast(enabled=False, device_type = 'cuda'): - q, k = q.float(), k.float() - sim = einsum('b i d, b j d -> b i j', q, k) * self.scale - else: - sim = einsum('b i d, b j d -> b i j', q, k) * self.scale - - del q, k - - if exists(mask): - mask = rearrange(mask, 'b ... 
-> b (...)') - max_neg_value = -torch.finfo(sim.dtype).max - mask = repeat(mask, 'b j -> (b h) () j', h=h) - sim.masked_fill_(~mask, max_neg_value) - - # attention, what we cannot get enough of - sim = sim.softmax(dim=-1) - - out = einsum('b i j, b j d -> b i d', sim, v) - out = rearrange(out, '(b h) n d -> b n (h d)', h=h) - return self.to_out(out) - - -class MemoryEfficientCrossAttention(nn.Module): - # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223 - def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0): - super().__init__() - print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using " - f"{heads} heads.") - inner_dim = dim_head * heads - context_dim = default(context_dim, query_dim) - - self.heads = heads - self.dim_head = dim_head - - self.to_q = nn.Linear(query_dim, inner_dim, bias=False) - self.to_k = nn.Linear(context_dim, inner_dim, bias=False) - self.to_v = nn.Linear(context_dim, inner_dim, bias=False) - - self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) - self.attention_op: Optional[Any] = None - - def forward(self, x, context=None, mask=None): - q = self.to_q(x) - context = default(context, x) - k = self.to_k(context) - v = self.to_v(context) - - b, _, _ = q.shape - q, k, v = map( - lambda t: t.unsqueeze(3) - .reshape(b, t.shape[1], self.heads, self.dim_head) - .permute(0, 2, 1, 3) - .reshape(b * self.heads, t.shape[1], self.dim_head) - .contiguous(), - (q, k, v), - ) - - # actually compute the attention, what we cannot get enough of - out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op) - - if exists(mask): - raise NotImplementedError - out = ( - out.unsqueeze(0) - .reshape(b, self.heads, out.shape[1], self.dim_head) - .permute(0, 2, 1, 3) - .reshape(b, out.shape[1], self.heads * self.dim_head) - ) - return self.to_out(out) - - -class BasicTransformerBlock(nn.Module): - ATTENTION_MODES = { - "softmax": CrossAttention, # vanilla attention - "softmax-xformers": MemoryEfficientCrossAttention - } - def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True, - disable_self_attn=False): - super().__init__() - attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax" - assert attn_mode in self.ATTENTION_MODES - attn_cls = self.ATTENTION_MODES[attn_mode] - self.disable_self_attn = disable_self_attn - self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, - context_dim=context_dim if self.disable_self_attn else None) # is a self-attention if not self.disable_self_attn - self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) - self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim, - heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none - self.norm1 = nn.LayerNorm(dim) - self.norm2 = nn.LayerNorm(dim) - self.norm3 = nn.LayerNorm(dim) - self.checkpoint = checkpoint - - def forward(self, x, context=None): - return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint) - - def _forward(self, x, context=None): - x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x - x = self.attn2(self.norm2(x), context=context) + x - x = self.ff(self.norm3(x)) + x - return x - - -class SpatialTransformer(nn.Module): - """ - Transformer block for image-like data. 
- First, project the input (aka embedding) - and reshape to b, t, d. - Then apply standard transformer action. - Finally, reshape to image - NEW: use_linear for more efficiency instead of the 1x1 convs - """ - def __init__(self, in_channels, n_heads, d_head, - depth=1, dropout=0., context_dim=None, - disable_self_attn=False, use_linear=False, - use_checkpoint=True): - super().__init__() - if exists(context_dim) and not isinstance(context_dim, list): - context_dim = [context_dim] - self.in_channels = in_channels - inner_dim = n_heads * d_head - self.norm = Normalize(in_channels) - if not use_linear: - self.proj_in = nn.Conv2d(in_channels, - inner_dim, - kernel_size=1, - stride=1, - padding=0) - else: - self.proj_in = nn.Linear(in_channels, inner_dim) - - self.transformer_blocks = nn.ModuleList( - [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d], - disable_self_attn=disable_self_attn, checkpoint=use_checkpoint) - for d in range(depth)] - ) - if not use_linear: - self.proj_out = zero_module(nn.Conv2d(inner_dim, - in_channels, - kernel_size=1, - stride=1, - padding=0)) - else: - self.proj_out = zero_module(nn.Linear(in_channels, inner_dim)) - self.use_linear = use_linear - - def forward(self, x, context=None): - # note: if no context is given, cross-attention defaults to self-attention - if not isinstance(context, list): - context = [context] - b, c, h, w = x.shape - x_in = x - x = self.norm(x) - if not self.use_linear: - x = self.proj_in(x) - x = rearrange(x, 'b c h w -> b (h w) c').contiguous() - if self.use_linear: - x = self.proj_in(x) - for i, block in enumerate(self.transformer_blocks): - x = block(x, context=context[i]) - if self.use_linear: - x = self.proj_out(x) - x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() - if not self.use_linear: - x = self.proj_out(x) - return x + x_in - diff --git a/ControlNet/ldm/modules/diffusionmodules/__init__.py b/ControlNet/ldm/modules/diffusionmodules/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/modules/diffusionmodules/model.py b/ControlNet/ldm/modules/diffusionmodules/model.py deleted file mode 100644 index 13fa72a48e4656fd1c3e2cf69fd0bdef4893d923..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/diffusionmodules/model.py +++ /dev/null @@ -1,852 +0,0 @@ -# pytorch_diffusion + derived encoder decoder -import math -import torch -import torch.nn as nn -import numpy as np -from einops import rearrange -from typing import Optional, Any - -from ControlNet.ldm.modules.attention import MemoryEfficientCrossAttention - -try: - import xformers - import xformers.ops - XFORMERS_IS_AVAILBLE = True -except: - XFORMERS_IS_AVAILBLE = False - print("No module 'xformers'. Proceeding without it.") - - -def get_timestep_embedding(timesteps, embedding_dim): - """ - This matches the implementation in Denoising Diffusion Probabilistic Models: - From Fairseq. - Build sinusoidal embeddings. - This matches the implementation in tensor2tensor, but differs slightly - from the description in Section 3.5 of "Attention Is All You Need". 
- """ - assert len(timesteps.shape) == 1 - - half_dim = embedding_dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) - emb = emb.to(device=timesteps.device) - emb = timesteps.float()[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0,1,0,0)) - return emb - - -def nonlinearity(x): - # swish - return x*torch.sigmoid(x) - - -def Normalize(in_channels, num_groups=32): - return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) - - -class Upsample(nn.Module): - def __init__(self, in_channels, with_conv): - super().__init__() - self.with_conv = with_conv - if self.with_conv: - self.conv = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=3, - stride=1, - padding=1) - - def forward(self, x): - x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") - if self.with_conv: - x = self.conv(x) - return x - - -class Downsample(nn.Module): - def __init__(self, in_channels, with_conv): - super().__init__() - self.with_conv = with_conv - if self.with_conv: - # no asymmetric padding in torch conv, must do it ourselves - self.conv = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=3, - stride=2, - padding=0) - - def forward(self, x): - if self.with_conv: - pad = (0,1,0,1) - x = torch.nn.functional.pad(x, pad, mode="constant", value=0) - x = self.conv(x) - else: - x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) - return x - - -class ResnetBlock(nn.Module): - def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, - dropout, temb_channels=512): - super().__init__() - self.in_channels = in_channels - out_channels = in_channels if out_channels is None else out_channels - self.out_channels = out_channels - self.use_conv_shortcut = conv_shortcut - - self.norm1 = Normalize(in_channels) - self.conv1 = torch.nn.Conv2d(in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1) - if temb_channels > 0: - self.temb_proj = torch.nn.Linear(temb_channels, - out_channels) - self.norm2 = Normalize(out_channels) - self.dropout = torch.nn.Dropout(dropout) - self.conv2 = torch.nn.Conv2d(out_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1) - if self.in_channels != self.out_channels: - if self.use_conv_shortcut: - self.conv_shortcut = torch.nn.Conv2d(in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1) - else: - self.nin_shortcut = torch.nn.Conv2d(in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0) - - def forward(self, x, temb): - h = x - h = self.norm1(h) - h = nonlinearity(h) - h = self.conv1(h) - - if temb is not None: - h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None] - - h = self.norm2(h) - h = nonlinearity(h) - h = self.dropout(h) - h = self.conv2(h) - - if self.in_channels != self.out_channels: - if self.use_conv_shortcut: - x = self.conv_shortcut(x) - else: - x = self.nin_shortcut(x) - - return x+h - - -class AttnBlock(nn.Module): - def __init__(self, in_channels): - super().__init__() - self.in_channels = in_channels - - self.norm = Normalize(in_channels) - self.q = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.k = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.v = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.proj_out = 
torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - - def forward(self, x): - h_ = x - h_ = self.norm(h_) - q = self.q(h_) - k = self.k(h_) - v = self.v(h_) - - # compute attention - b,c,h,w = q.shape - q = q.reshape(b,c,h*w) - q = q.permute(0,2,1) # b,hw,c - k = k.reshape(b,c,h*w) # b,c,hw - w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] - w_ = w_ * (int(c)**(-0.5)) - w_ = torch.nn.functional.softmax(w_, dim=2) - - # attend to values - v = v.reshape(b,c,h*w) - w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q) - h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] - h_ = h_.reshape(b,c,h,w) - - h_ = self.proj_out(h_) - - return x+h_ - -class MemoryEfficientAttnBlock(nn.Module): - """ - Uses xformers efficient implementation, - see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223 - Note: this is a single-head self-attention operation - """ - # - def __init__(self, in_channels): - super().__init__() - self.in_channels = in_channels - - self.norm = Normalize(in_channels) - self.q = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.k = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.v = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.proj_out = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0) - self.attention_op: Optional[Any] = None - - def forward(self, x): - h_ = x - h_ = self.norm(h_) - q = self.q(h_) - k = self.k(h_) - v = self.v(h_) - - # compute attention - B, C, H, W = q.shape - q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), (q, k, v)) - - q, k, v = map( - lambda t: t.unsqueeze(3) - .reshape(B, t.shape[1], 1, C) - .permute(0, 2, 1, 3) - .reshape(B * 1, t.shape[1], C) - .contiguous(), - (q, k, v), - ) - out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op) - - out = ( - out.unsqueeze(0) - .reshape(B, 1, out.shape[1], C) - .permute(0, 2, 1, 3) - .reshape(B, out.shape[1], C) - ) - out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C) - out = self.proj_out(out) - return x+out - - -class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention): - def forward(self, x, context=None, mask=None): - b, c, h, w = x.shape - x = rearrange(x, 'b c h w -> b (h w) c') - out = super().forward(x, context=context, mask=mask) - out = rearrange(out, 'b (h w) c -> b c h w', h=h, w=w, c=c) - return x + out - - -def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None): - assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none"], f'attn_type {attn_type} unknown' - if XFORMERS_IS_AVAILBLE and attn_type == "vanilla": - attn_type = "vanilla-xformers" - print(f"making attention of type '{attn_type}' with {in_channels} in_channels") - if attn_type == "vanilla": - assert attn_kwargs is None - return AttnBlock(in_channels) - elif attn_type == "vanilla-xformers": - print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...") - return MemoryEfficientAttnBlock(in_channels) - elif type == "memory-efficient-cross-attn": - attn_kwargs["query_dim"] = in_channels - return MemoryEfficientCrossAttentionWrapper(**attn_kwargs) - elif attn_type == "none": - return nn.Identity(in_channels) - else: - raise NotImplementedError() - - -class Model(nn.Module): - 
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, - attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, - resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"): - super().__init__() - if use_linear_attn: attn_type = "linear" - self.ch = ch - self.temb_ch = self.ch*4 - self.num_resolutions = len(ch_mult) - self.num_res_blocks = num_res_blocks - self.resolution = resolution - self.in_channels = in_channels - - self.use_timestep = use_timestep - if self.use_timestep: - # timestep embedding - self.temb = nn.Module() - self.temb.dense = nn.ModuleList([ - torch.nn.Linear(self.ch, - self.temb_ch), - torch.nn.Linear(self.temb_ch, - self.temb_ch), - ]) - - # downsampling - self.conv_in = torch.nn.Conv2d(in_channels, - self.ch, - kernel_size=3, - stride=1, - padding=1) - - curr_res = resolution - in_ch_mult = (1,)+tuple(ch_mult) - self.down = nn.ModuleList() - for i_level in range(self.num_resolutions): - block = nn.ModuleList() - attn = nn.ModuleList() - block_in = ch*in_ch_mult[i_level] - block_out = ch*ch_mult[i_level] - for i_block in range(self.num_res_blocks): - block.append(ResnetBlock(in_channels=block_in, - out_channels=block_out, - temb_channels=self.temb_ch, - dropout=dropout)) - block_in = block_out - if curr_res in attn_resolutions: - attn.append(make_attn(block_in, attn_type=attn_type)) - down = nn.Module() - down.block = block - down.attn = attn - if i_level != self.num_resolutions-1: - down.downsample = Downsample(block_in, resamp_with_conv) - curr_res = curr_res // 2 - self.down.append(down) - - # middle - self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) - self.mid.block_2 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - - # upsampling - self.up = nn.ModuleList() - for i_level in reversed(range(self.num_resolutions)): - block = nn.ModuleList() - attn = nn.ModuleList() - block_out = ch*ch_mult[i_level] - skip_in = ch*ch_mult[i_level] - for i_block in range(self.num_res_blocks+1): - if i_block == self.num_res_blocks: - skip_in = ch*in_ch_mult[i_level] - block.append(ResnetBlock(in_channels=block_in+skip_in, - out_channels=block_out, - temb_channels=self.temb_ch, - dropout=dropout)) - block_in = block_out - if curr_res in attn_resolutions: - attn.append(make_attn(block_in, attn_type=attn_type)) - up = nn.Module() - up.block = block - up.attn = attn - if i_level != 0: - up.upsample = Upsample(block_in, resamp_with_conv) - curr_res = curr_res * 2 - self.up.insert(0, up) # prepend to get consistent order - - # end - self.norm_out = Normalize(block_in) - self.conv_out = torch.nn.Conv2d(block_in, - out_ch, - kernel_size=3, - stride=1, - padding=1) - - def forward(self, x, t=None, context=None): - #assert x.shape[2] == x.shape[3] == self.resolution - if context is not None: - # assume aligned context, cat along channel axis - x = torch.cat((x, context), dim=1) - if self.use_timestep: - # timestep embedding - assert t is not None - temb = get_timestep_embedding(t, self.ch) - temb = self.temb.dense[0](temb) - temb = nonlinearity(temb) - temb = self.temb.dense[1](temb) - else: - temb = None - - # downsampling - hs = [self.conv_in(x)] - for i_level in range(self.num_resolutions): - for i_block in range(self.num_res_blocks): - h = self.down[i_level].block[i_block](hs[-1], temb) - if len(self.down[i_level].attn) > 0: - h 
= self.down[i_level].attn[i_block](h) - hs.append(h) - if i_level != self.num_resolutions-1: - hs.append(self.down[i_level].downsample(hs[-1])) - - # middle - h = hs[-1] - h = self.mid.block_1(h, temb) - h = self.mid.attn_1(h) - h = self.mid.block_2(h, temb) - - # upsampling - for i_level in reversed(range(self.num_resolutions)): - for i_block in range(self.num_res_blocks+1): - h = self.up[i_level].block[i_block]( - torch.cat([h, hs.pop()], dim=1), temb) - if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h) - if i_level != 0: - h = self.up[i_level].upsample(h) - - # end - h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h) - return h - - def get_last_layer(self): - return self.conv_out.weight - - -class Encoder(nn.Module): - def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, - attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, - resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla", - **ignore_kwargs): - super().__init__() - if use_linear_attn: attn_type = "linear" - self.ch = ch - self.temb_ch = 0 - self.num_resolutions = len(ch_mult) - self.num_res_blocks = num_res_blocks - self.resolution = resolution - self.in_channels = in_channels - - # downsampling - self.conv_in = torch.nn.Conv2d(in_channels, - self.ch, - kernel_size=3, - stride=1, - padding=1) - - curr_res = resolution - in_ch_mult = (1,)+tuple(ch_mult) - self.in_ch_mult = in_ch_mult - self.down = nn.ModuleList() - for i_level in range(self.num_resolutions): - block = nn.ModuleList() - attn = nn.ModuleList() - block_in = ch*in_ch_mult[i_level] - block_out = ch*ch_mult[i_level] - for i_block in range(self.num_res_blocks): - block.append(ResnetBlock(in_channels=block_in, - out_channels=block_out, - temb_channels=self.temb_ch, - dropout=dropout)) - block_in = block_out - if curr_res in attn_resolutions: - attn.append(make_attn(block_in, attn_type=attn_type)) - down = nn.Module() - down.block = block - down.attn = attn - if i_level != self.num_resolutions-1: - down.downsample = Downsample(block_in, resamp_with_conv) - curr_res = curr_res // 2 - self.down.append(down) - - # middle - self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) - self.mid.block_2 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - - # end - self.norm_out = Normalize(block_in) - self.conv_out = torch.nn.Conv2d(block_in, - 2*z_channels if double_z else z_channels, - kernel_size=3, - stride=1, - padding=1) - - def forward(self, x): - # timestep embedding - temb = None - - # downsampling - hs = [self.conv_in(x)] - for i_level in range(self.num_resolutions): - for i_block in range(self.num_res_blocks): - h = self.down[i_level].block[i_block](hs[-1], temb) - if len(self.down[i_level].attn) > 0: - h = self.down[i_level].attn[i_block](h) - hs.append(h) - if i_level != self.num_resolutions-1: - hs.append(self.down[i_level].downsample(hs[-1])) - - # middle - h = hs[-1] - h = self.mid.block_1(h, temb) - h = self.mid.attn_1(h) - h = self.mid.block_2(h, temb) - - # end - h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h) - return h - - -class Decoder(nn.Module): - def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, - attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, - resolution, z_channels, give_pre_end=False, 
tanh_out=False, use_linear_attn=False, - attn_type="vanilla", **ignorekwargs): - super().__init__() - if use_linear_attn: attn_type = "linear" - self.ch = ch - self.temb_ch = 0 - self.num_resolutions = len(ch_mult) - self.num_res_blocks = num_res_blocks - self.resolution = resolution - self.in_channels = in_channels - self.give_pre_end = give_pre_end - self.tanh_out = tanh_out - - # compute in_ch_mult, block_in and curr_res at lowest res - in_ch_mult = (1,)+tuple(ch_mult) - block_in = ch*ch_mult[self.num_resolutions-1] - curr_res = resolution // 2**(self.num_resolutions-1) - self.z_shape = (1,z_channels,curr_res,curr_res) - print("Working with z of shape {} = {} dimensions.".format( - self.z_shape, np.prod(self.z_shape))) - - # z to block_in - self.conv_in = torch.nn.Conv2d(z_channels, - block_in, - kernel_size=3, - stride=1, - padding=1) - - # middle - self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) - self.mid.block_2 = ResnetBlock(in_channels=block_in, - out_channels=block_in, - temb_channels=self.temb_ch, - dropout=dropout) - - # upsampling - self.up = nn.ModuleList() - for i_level in reversed(range(self.num_resolutions)): - block = nn.ModuleList() - attn = nn.ModuleList() - block_out = ch*ch_mult[i_level] - for i_block in range(self.num_res_blocks+1): - block.append(ResnetBlock(in_channels=block_in, - out_channels=block_out, - temb_channels=self.temb_ch, - dropout=dropout)) - block_in = block_out - if curr_res in attn_resolutions: - attn.append(make_attn(block_in, attn_type=attn_type)) - up = nn.Module() - up.block = block - up.attn = attn - if i_level != 0: - up.upsample = Upsample(block_in, resamp_with_conv) - curr_res = curr_res * 2 - self.up.insert(0, up) # prepend to get consistent order - - # end - self.norm_out = Normalize(block_in) - self.conv_out = torch.nn.Conv2d(block_in, - out_ch, - kernel_size=3, - stride=1, - padding=1) - - def forward(self, z): - #assert z.shape[1:] == self.z_shape[1:] - self.last_z_shape = z.shape - - # timestep embedding - temb = None - - # z to block_in - h = self.conv_in(z) - - # middle - h = self.mid.block_1(h, temb) - h = self.mid.attn_1(h) - h = self.mid.block_2(h, temb) - - # upsampling - for i_level in reversed(range(self.num_resolutions)): - for i_block in range(self.num_res_blocks+1): - h = self.up[i_level].block[i_block](h, temb) - if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h) - if i_level != 0: - h = self.up[i_level].upsample(h) - - # end - if self.give_pre_end: - return h - - h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h) - if self.tanh_out: - h = torch.tanh(h) - return h - - -class SimpleDecoder(nn.Module): - def __init__(self, in_channels, out_channels, *args, **kwargs): - super().__init__() - self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1), - ResnetBlock(in_channels=in_channels, - out_channels=2 * in_channels, - temb_channels=0, dropout=0.0), - ResnetBlock(in_channels=2 * in_channels, - out_channels=4 * in_channels, - temb_channels=0, dropout=0.0), - ResnetBlock(in_channels=4 * in_channels, - out_channels=2 * in_channels, - temb_channels=0, dropout=0.0), - nn.Conv2d(2*in_channels, in_channels, 1), - Upsample(in_channels, with_conv=True)]) - # end - self.norm_out = Normalize(in_channels) - self.conv_out = torch.nn.Conv2d(in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1) - - def 
forward(self, x): - for i, layer in enumerate(self.model): - if i in [1,2,3]: - x = layer(x, None) - else: - x = layer(x) - - h = self.norm_out(x) - h = nonlinearity(h) - x = self.conv_out(h) - return x - - -class UpsampleDecoder(nn.Module): - def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution, - ch_mult=(2,2), dropout=0.0): - super().__init__() - # upsampling - self.temb_ch = 0 - self.num_resolutions = len(ch_mult) - self.num_res_blocks = num_res_blocks - block_in = in_channels - curr_res = resolution // 2 ** (self.num_resolutions - 1) - self.res_blocks = nn.ModuleList() - self.upsample_blocks = nn.ModuleList() - for i_level in range(self.num_resolutions): - res_block = [] - block_out = ch * ch_mult[i_level] - for i_block in range(self.num_res_blocks + 1): - res_block.append(ResnetBlock(in_channels=block_in, - out_channels=block_out, - temb_channels=self.temb_ch, - dropout=dropout)) - block_in = block_out - self.res_blocks.append(nn.ModuleList(res_block)) - if i_level != self.num_resolutions - 1: - self.upsample_blocks.append(Upsample(block_in, True)) - curr_res = curr_res * 2 - - # end - self.norm_out = Normalize(block_in) - self.conv_out = torch.nn.Conv2d(block_in, - out_channels, - kernel_size=3, - stride=1, - padding=1) - - def forward(self, x): - # upsampling - h = x - for k, i_level in enumerate(range(self.num_resolutions)): - for i_block in range(self.num_res_blocks + 1): - h = self.res_blocks[i_level][i_block](h, None) - if i_level != self.num_resolutions - 1: - h = self.upsample_blocks[k](h) - h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h) - return h - - -class LatentRescaler(nn.Module): - def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): - super().__init__() - # residual block, interpolate, residual block - self.factor = factor - self.conv_in = nn.Conv2d(in_channels, - mid_channels, - kernel_size=3, - stride=1, - padding=1) - self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels, - out_channels=mid_channels, - temb_channels=0, - dropout=0.0) for _ in range(depth)]) - self.attn = AttnBlock(mid_channels) - self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels, - out_channels=mid_channels, - temb_channels=0, - dropout=0.0) for _ in range(depth)]) - - self.conv_out = nn.Conv2d(mid_channels, - out_channels, - kernel_size=1, - ) - - def forward(self, x): - x = self.conv_in(x) - for block in self.res_block1: - x = block(x, None) - x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor)))) - x = self.attn(x) - for block in self.res_block2: - x = block(x, None) - x = self.conv_out(x) - return x - - -class MergedRescaleEncoder(nn.Module): - def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks, - attn_resolutions, dropout=0.0, resamp_with_conv=True, - ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1): - super().__init__() - intermediate_chn = ch * ch_mult[-1] - self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult, - z_channels=intermediate_chn, double_z=False, resolution=resolution, - attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv, - out_ch=None) - self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn, - mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth) - - def forward(self, x): - x = self.encoder(x) - x = self.rescaler(x) - return x - - -class 
MergedRescaleDecoder(nn.Module): - def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8), - dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1): - super().__init__() - tmp_chn = z_channels*ch_mult[-1] - self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout, - resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks, - ch_mult=ch_mult, resolution=resolution, ch=ch) - self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn, - out_channels=tmp_chn, depth=rescale_module_depth) - - def forward(self, x): - x = self.rescaler(x) - x = self.decoder(x) - return x - - -class Upsampler(nn.Module): - def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2): - super().__init__() - assert out_size >= in_size - num_blocks = int(np.log2(out_size//in_size))+1 - factor_up = 1.+ (out_size % in_size) - print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}") - self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels, - out_channels=in_channels) - self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2, - attn_resolutions=[], in_channels=None, ch=in_channels, - ch_mult=[ch_mult for _ in range(num_blocks)]) - - def forward(self, x): - x = self.rescaler(x) - x = self.decoder(x) - return x - - -class Resize(nn.Module): - def __init__(self, in_channels=None, learned=False, mode="bilinear"): - super().__init__() - self.with_conv = learned - self.mode = mode - if self.with_conv: - print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode") - raise NotImplementedError() - assert in_channels is not None - # no asymmetric padding in torch conv, must do it ourselves - self.conv = torch.nn.Conv2d(in_channels, - in_channels, - kernel_size=4, - stride=2, - padding=1) - - def forward(self, x, scale_factor=1.0): - if scale_factor==1.0: - return x - else: - x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor) - return x diff --git a/ControlNet/ldm/modules/diffusionmodules/openaimodel.py b/ControlNet/ldm/modules/diffusionmodules/openaimodel.py deleted file mode 100644 index 10d467f9f6153dade3a394547f89076ba4a7c677..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/diffusionmodules/openaimodel.py +++ /dev/null @@ -1,786 +0,0 @@ -from abc import abstractmethod -import math - -import numpy as np -import torch as th -import torch.nn as nn -import torch.nn.functional as F - -from ControlNet.ldm.modules.diffusionmodules.util import ( - checkpoint, - conv_nd, - linear, - avg_pool_nd, - zero_module, - normalization, - timestep_embedding, -) -from ControlNet.ldm.modules.attention import SpatialTransformer -from ControlNet.ldm.util import exists - - -# dummy replace -def convert_module_to_f16(x): - pass - -def convert_module_to_f32(x): - pass - - -## go -class AttentionPool2d(nn.Module): - """ - Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py - """ - - def __init__( - self, - spacial_dim: int, - embed_dim: int, - num_heads_channels: int, - output_dim: int = None, - ): - super().__init__() - self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) - self.qkv_proj = conv_nd(1, embed_dim, 
3 * embed_dim, 1) - self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) - self.num_heads = embed_dim // num_heads_channels - self.attention = QKVAttention(self.num_heads) - - def forward(self, x): - b, c, *_spatial = x.shape - x = x.reshape(b, c, -1) # NC(HW) - x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) - x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) - x = self.qkv_proj(x) - x = self.attention(x) - x = self.c_proj(x) - return x[:, :, 0] - - -class TimestepBlock(nn.Module): - """ - Any module where forward() takes timestep embeddings as a second argument. - """ - - @abstractmethod - def forward(self, x, emb): - """ - Apply the module to `x` given `emb` timestep embeddings. - """ - - -class TimestepEmbedSequential(nn.Sequential, TimestepBlock): - """ - A sequential module that passes timestep embeddings to the children that - support it as an extra input. - """ - - def forward(self, x, emb, context=None): - for layer in self: - if isinstance(layer, TimestepBlock): - x = layer(x, emb) - elif isinstance(layer, SpatialTransformer): - x = layer(x, context) - else: - x = layer(x) - return x - - -class Upsample(nn.Module): - """ - An upsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then - upsampling occurs in the inner-two dimensions. - """ - - def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.dims = dims - if use_conv: - self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding) - - def forward(self, x): - assert x.shape[1] == self.channels - if self.dims == 3: - x = F.interpolate( - x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" - ) - else: - x = F.interpolate(x, scale_factor=2, mode="nearest") - if self.use_conv: - x = self.conv(x) - return x - -class TransposedUpsample(nn.Module): - 'Learned 2x upsampling without padding' - def __init__(self, channels, out_channels=None, ks=5): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - - self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2) - - def forward(self,x): - return self.up(x) - - -class Downsample(nn.Module): - """ - A downsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then - downsampling occurs in the inner-two dimensions. - """ - - def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.dims = dims - stride = 2 if dims != 3 else (1, 2, 2) - if use_conv: - self.op = conv_nd( - dims, self.channels, self.out_channels, 3, stride=stride, padding=padding - ) - else: - assert self.channels == self.out_channels - self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) - - def forward(self, x): - assert x.shape[1] == self.channels - return self.op(x) - - -class ResBlock(TimestepBlock): - """ - A residual block that can optionally change the number of channels. - :param channels: the number of input channels. 
- :param emb_channels: the number of timestep embedding channels. - :param dropout: the rate of dropout. - :param out_channels: if specified, the number of out channels. - :param use_conv: if True and out_channels is specified, use a spatial - convolution instead of a smaller 1x1 convolution to change the - channels in the skip connection. - :param dims: determines if the signal is 1D, 2D, or 3D. - :param use_checkpoint: if True, use gradient checkpointing on this module. - :param up: if True, use this block for upsampling. - :param down: if True, use this block for downsampling. - """ - - def __init__( - self, - channels, - emb_channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - dims=2, - use_checkpoint=False, - up=False, - down=False, - ): - super().__init__() - self.channels = channels - self.emb_channels = emb_channels - self.dropout = dropout - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_checkpoint = use_checkpoint - self.use_scale_shift_norm = use_scale_shift_norm - - self.in_layers = nn.Sequential( - normalization(channels), - nn.SiLU(), - conv_nd(dims, channels, self.out_channels, 3, padding=1), - ) - - self.updown = up or down - - if up: - self.h_upd = Upsample(channels, False, dims) - self.x_upd = Upsample(channels, False, dims) - elif down: - self.h_upd = Downsample(channels, False, dims) - self.x_upd = Downsample(channels, False, dims) - else: - self.h_upd = self.x_upd = nn.Identity() - - self.emb_layers = nn.Sequential( - nn.SiLU(), - linear( - emb_channels, - 2 * self.out_channels if use_scale_shift_norm else self.out_channels, - ), - ) - self.out_layers = nn.Sequential( - normalization(self.out_channels), - nn.SiLU(), - nn.Dropout(p=dropout), - zero_module( - conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1) - ), - ) - - if self.out_channels == channels: - self.skip_connection = nn.Identity() - elif use_conv: - self.skip_connection = conv_nd( - dims, channels, self.out_channels, 3, padding=1 - ) - else: - self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) - - def forward(self, x, emb): - """ - Apply the block to a Tensor, conditioned on a timestep embedding. - :param x: an [N x C x ...] Tensor of features. - :param emb: an [N x emb_channels] Tensor of timestep embeddings. - :return: an [N x C x ...] Tensor of outputs. - """ - return checkpoint( - self._forward, (x, emb), self.parameters(), self.use_checkpoint - ) - - - def _forward(self, x, emb): - if self.updown: - in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] - h = in_rest(x) - h = self.h_upd(h) - x = self.x_upd(x) - h = in_conv(h) - else: - h = self.in_layers(x) - emb_out = self.emb_layers(emb).type(h.dtype) - while len(emb_out.shape) < len(h.shape): - emb_out = emb_out[..., None] - if self.use_scale_shift_norm: - out_norm, out_rest = self.out_layers[0], self.out_layers[1:] - scale, shift = th.chunk(emb_out, 2, dim=1) - h = out_norm(h) * (1 + scale) + shift - h = out_rest(h) - else: - h = h + emb_out - h = self.out_layers(h) - return self.skip_connection(x) + h - - -class AttentionBlock(nn.Module): - """ - An attention block that allows spatial positions to attend to each other. - Originally ported from here, but adapted to the N-d case. - https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
- """ - - def __init__( - self, - channels, - num_heads=1, - num_head_channels=-1, - use_checkpoint=False, - use_new_attention_order=False, - ): - super().__init__() - self.channels = channels - if num_head_channels == -1: - self.num_heads = num_heads - else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" - self.num_heads = channels // num_head_channels - self.use_checkpoint = use_checkpoint - self.norm = normalization(channels) - self.qkv = conv_nd(1, channels, channels * 3, 1) - if use_new_attention_order: - # split qkv before split heads - self.attention = QKVAttention(self.num_heads) - else: - # split heads before split qkv - self.attention = QKVAttentionLegacy(self.num_heads) - - self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) - - def forward(self, x): - return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! - #return pt_checkpoint(self._forward, x) # pytorch - - def _forward(self, x): - b, c, *spatial = x.shape - x = x.reshape(b, c, -1) - qkv = self.qkv(self.norm(x)) - h = self.attention(qkv) - h = self.proj_out(h) - return (x + h).reshape(b, c, *spatial) - - -def count_flops_attn(model, _x, y): - """ - A counter for the `thop` package to count the operations in an - attention operation. - Meant to be used like: - macs, params = thop.profile( - model, - inputs=(inputs, timestamps), - custom_ops={QKVAttention: QKVAttention.count_flops}, - ) - """ - b, c, *spatial = y[0].shape - num_spatial = int(np.prod(spatial)) - # We perform two matmuls with the same number of ops. - # The first computes the weight matrix, the second computes - # the combination of the value vectors. - matmul_ops = 2 * b * (num_spatial ** 2) * c - model.total_ops += th.DoubleTensor([matmul_ops]) - - -class QKVAttentionLegacy(nn.Module): - """ - A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping - """ - - def __init__(self, n_heads): - super().__init__() - self.n_heads = n_heads - - def forward(self, qkv): - """ - Apply QKV attention. - :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. - """ - bs, width, length = qkv.shape - assert width % (3 * self.n_heads) == 0 - ch = width // (3 * self.n_heads) - q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) - scale = 1 / math.sqrt(math.sqrt(ch)) - weight = th.einsum( - "bct,bcs->bts", q * scale, k * scale - ) # More stable with f16 than dividing afterwards - weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) - a = th.einsum("bts,bcs->bct", weight, v) - return a.reshape(bs, -1, length) - - @staticmethod - def count_flops(model, _x, y): - return count_flops_attn(model, _x, y) - - -class QKVAttention(nn.Module): - """ - A module which performs QKV attention and splits in a different order. - """ - - def __init__(self, n_heads): - super().__init__() - self.n_heads = n_heads - - def forward(self, qkv): - """ - Apply QKV attention. - :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention.
- """ - bs, width, length = qkv.shape - assert width % (3 * self.n_heads) == 0 - ch = width // (3 * self.n_heads) - q, k, v = qkv.chunk(3, dim=1) - scale = 1 / math.sqrt(math.sqrt(ch)) - weight = th.einsum( - "bct,bcs->bts", - (q * scale).view(bs * self.n_heads, ch, length), - (k * scale).view(bs * self.n_heads, ch, length), - ) # More stable with f16 than dividing afterwards - weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) - a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) - return a.reshape(bs, -1, length) - - @staticmethod - def count_flops(model, _x, y): - return count_flops_attn(model, _x, y) - - -class UNetModel(nn.Module): - """ - The full UNet model with attention and timestep embedding. - :param in_channels: channels in the input Tensor. - :param model_channels: base channel count for the model. - :param out_channels: channels in the output Tensor. - :param num_res_blocks: number of residual blocks per downsample. - :param attention_resolutions: a collection of downsample rates at which - attention will take place. May be a set, list, or tuple. - For example, if this contains 4, then at 4x downsampling, attention - will be used. - :param dropout: the dropout probability. - :param channel_mult: channel multiplier for each level of the UNet. - :param conv_resample: if True, use learned convolutions for upsampling and - downsampling. - :param dims: determines if the signal is 1D, 2D, or 3D. - :param num_classes: if specified (as an int), then this model will be - class-conditional with `num_classes` classes. - :param use_checkpoint: use gradient checkpointing to reduce memory usage. - :param num_heads: the number of attention heads in each attention layer. - :param num_heads_channels: if specified, ignore num_heads and instead use - a fixed channel width per attention head. - :param num_heads_upsample: works with num_heads to set a different number - of heads for upsampling. Deprecated. - :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. - :param resblock_updown: use residual blocks for up/downsampling. - :param use_new_attention_order: use a different attention pattern for potentially - increased efficiency. - """ - - def __init__( - self, - image_size, - in_channels, - model_channels, - out_channels, - num_res_blocks, - attention_resolutions, - dropout=0, - channel_mult=(1, 2, 4, 8), - conv_resample=True, - dims=2, - num_classes=None, - use_checkpoint=False, - use_fp16=False, - num_heads=-1, - num_head_channels=-1, - num_heads_upsample=-1, - use_scale_shift_norm=False, - resblock_updown=False, - use_new_attention_order=False, - use_spatial_transformer=False, # custom transformer support - transformer_depth=1, # custom transformer support - context_dim=None, # custom transformer support - n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model - legacy=True, - disable_self_attentions=None, - num_attention_blocks=None, - disable_middle_self_attn=False, - use_linear_in_transformer=False, - ): - super().__init__() - if use_spatial_transformer: - assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...' - - if context_dim is not None: - assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...' 
- from omegaconf.listconfig import ListConfig - if type(context_dim) == ListConfig: - context_dim = list(context_dim) - - if num_heads_upsample == -1: - num_heads_upsample = num_heads - - if num_heads == -1: - assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' - - if num_head_channels == -1: - assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' - - self.image_size = image_size - self.in_channels = in_channels - self.model_channels = model_channels - self.out_channels = out_channels - if isinstance(num_res_blocks, int): - self.num_res_blocks = len(channel_mult) * [num_res_blocks] - else: - if len(num_res_blocks) != len(channel_mult): - raise ValueError("provide num_res_blocks either as an int (globally constant) or " - "as a list/tuple (per-level) with the same length as channel_mult") - self.num_res_blocks = num_res_blocks - if disable_self_attentions is not None: - # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not - assert len(disable_self_attentions) == len(channel_mult) - if num_attention_blocks is not None: - assert len(num_attention_blocks) == len(self.num_res_blocks) - assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)))) - print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. " - f"This option has LESS priority than attention_resolutions {attention_resolutions}, " - f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, " - f"attention will still not be set.") - - self.attention_resolutions = attention_resolutions - self.dropout = dropout - self.channel_mult = channel_mult - self.conv_resample = conv_resample - self.num_classes = num_classes - self.use_checkpoint = use_checkpoint - self.dtype = th.float16 if use_fp16 else th.float32 - self.num_heads = num_heads - self.num_head_channels = num_head_channels - self.num_heads_upsample = num_heads_upsample - self.predict_codebook_ids = n_embed is not None - - time_embed_dim = model_channels * 4 - self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), - nn.SiLU(), - linear(time_embed_dim, time_embed_dim), - ) - - if self.num_classes is not None: - if isinstance(self.num_classes, int): - self.label_emb = nn.Embedding(num_classes, time_embed_dim) - elif self.num_classes == "continuous": - print("setting up linear c_adm embedding layer") - self.label_emb = nn.Linear(1, time_embed_dim) - else: - raise ValueError() - - self.input_blocks = nn.ModuleList( - [ - TimestepEmbedSequential( - conv_nd(dims, in_channels, model_channels, 3, padding=1) - ) - ] - ) - self._feature_size = model_channels - input_block_chans = [model_channels] - ch = model_channels - ds = 1 - for level, mult in enumerate(channel_mult): - for nr in range(self.num_res_blocks[level]): - layers = [ - ResBlock( - ch, - time_embed_dim, - dropout, - out_channels=mult * model_channels, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ) - ] - ch = mult * model_channels - if ds in attention_resolutions: - if num_head_channels == -1: - dim_head = ch // num_heads - else: - num_heads = ch // num_head_channels - dim_head = num_head_channels - if legacy: - #num_heads = 1 - dim_head = ch // num_heads if use_spatial_transformer else num_head_channels - if exists(disable_self_attentions): - disabled_sa = disable_self_attentions[level] - else: - disabled_sa = False - - if not 
exists(num_attention_blocks) or nr < num_attention_blocks[level]: - layers.append( - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) if not use_spatial_transformer else SpatialTransformer( - ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, - disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint - ) - ) - self.input_blocks.append(TimestepEmbedSequential(*layers)) - self._feature_size += ch - input_block_chans.append(ch) - if level != len(channel_mult) - 1: - out_ch = ch - self.input_blocks.append( - TimestepEmbedSequential( - ResBlock( - ch, - time_embed_dim, - dropout, - out_channels=out_ch, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - down=True, - ) - if resblock_updown - else Downsample( - ch, conv_resample, dims=dims, out_channels=out_ch - ) - ) - ) - ch = out_ch - input_block_chans.append(ch) - ds *= 2 - self._feature_size += ch - - if num_head_channels == -1: - dim_head = ch // num_heads - else: - num_heads = ch // num_head_channels - dim_head = num_head_channels - if legacy: - #num_heads = 1 - dim_head = ch // num_heads if use_spatial_transformer else num_head_channels - self.middle_block = TimestepEmbedSequential( - ResBlock( - ch, - time_embed_dim, - dropout, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ), - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn - ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint - ), - ResBlock( - ch, - time_embed_dim, - dropout, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ), - ) - self._feature_size += ch - - self.output_blocks = nn.ModuleList([]) - for level, mult in list(enumerate(channel_mult))[::-1]: - for i in range(self.num_res_blocks[level] + 1): - ich = input_block_chans.pop() - layers = [ - ResBlock( - ch + ich, - time_embed_dim, - dropout, - out_channels=model_channels * mult, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - ) - ] - ch = model_channels * mult - if ds in attention_resolutions: - if num_head_channels == -1: - dim_head = ch // num_heads - else: - num_heads = ch // num_head_channels - dim_head = num_head_channels - if legacy: - #num_heads = 1 - dim_head = ch // num_heads if use_spatial_transformer else num_head_channels - if exists(disable_self_attentions): - disabled_sa = disable_self_attentions[level] - else: - disabled_sa = False - - if not exists(num_attention_blocks) or i < num_attention_blocks[level]: - layers.append( - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads_upsample, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) if not use_spatial_transformer else SpatialTransformer( - ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, - disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint - ) - ) - if level and i == self.num_res_blocks[level]: - out_ch = ch - layers.append( - ResBlock( - ch, 
- time_embed_dim, - dropout, - out_channels=out_ch, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - up=True, - ) - if resblock_updown - else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) - ) - ds //= 2 - self.output_blocks.append(TimestepEmbedSequential(*layers)) - self._feature_size += ch - - self.out = nn.Sequential( - normalization(ch), - nn.SiLU(), - zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)), - ) - if self.predict_codebook_ids: - self.id_predictor = nn.Sequential( - normalization(ch), - conv_nd(dims, model_channels, n_embed, 1), - #nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits - ) - - def convert_to_fp16(self): - """ - Convert the torso of the model to float16. - """ - self.input_blocks.apply(convert_module_to_f16) - self.middle_block.apply(convert_module_to_f16) - self.output_blocks.apply(convert_module_to_f16) - - def convert_to_fp32(self): - """ - Convert the torso of the model to float32. - """ - self.input_blocks.apply(convert_module_to_f32) - self.middle_block.apply(convert_module_to_f32) - self.output_blocks.apply(convert_module_to_f32) - - def forward(self, x, timesteps=None, context=None, y=None,**kwargs): - """ - Apply the model to an input batch. - :param x: an [N x C x ...] Tensor of inputs. - :param timesteps: a 1-D batch of timesteps. - :param context: conditioning plugged in via crossattn - :param y: an [N] Tensor of labels, if class-conditional. - :return: an [N x C x ...] Tensor of outputs. - """ - assert (y is not None) == ( - self.num_classes is not None - ), "must specify y if and only if the model is class-conditional" - hs = [] - t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) - emb = self.time_embed(t_emb) - - if self.num_classes is not None: - assert y.shape[0] == x.shape[0] - emb = emb + self.label_emb(y) - - h = x.type(self.dtype) - for module in self.input_blocks: - h = module(h, emb, context) - hs.append(h) - h = self.middle_block(h, emb, context) - for module in self.output_blocks: - h = th.cat([h, hs.pop()], dim=1) - h = module(h, emb, context) - h = h.type(x.dtype) - if self.predict_codebook_ids: - return self.id_predictor(h) - else: - return self.out(h) diff --git a/ControlNet/ldm/modules/diffusionmodules/upscaling.py b/ControlNet/ldm/modules/diffusionmodules/upscaling.py deleted file mode 100644 index 03816662098ce1ffac79bd939b892e867ab91988..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/diffusionmodules/upscaling.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -import torch.nn as nn -import numpy as np -from functools import partial - -from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule -from ldm.util import default - - -class AbstractLowScaleModel(nn.Module): - # for concatenating a downsampled image to the latent representation - def __init__(self, noise_schedule_config=None): - super(AbstractLowScaleModel, self).__init__() - if noise_schedule_config is not None: - self.register_schedule(**noise_schedule_config) - - def register_schedule(self, beta_schedule="linear", timesteps=1000, - linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): - betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, - cosine_s=cosine_s) - alphas = 1. 
- betas - alphas_cumprod = np.cumprod(alphas, axis=0) - alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) - - timesteps, = betas.shape - self.num_timesteps = int(timesteps) - self.linear_start = linear_start - self.linear_end = linear_end - assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' - - to_torch = partial(torch.tensor, dtype=torch.float32) - - self.register_buffer('betas', to_torch(betas)) - self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) - self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) - self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) - self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) - self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) - self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) - - def q_sample(self, x_start, t, noise=None): - noise = default(noise, lambda: torch.randn_like(x_start)) - return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) - - def forward(self, x): - return x, None - - def decode(self, x): - return x - - -class SimpleImageConcat(AbstractLowScaleModel): - # no noise level conditioning - def __init__(self): - super(SimpleImageConcat, self).__init__(noise_schedule_config=None) - self.max_noise_level = 0 - - def forward(self, x): - # fix to constant noise level - return x, torch.zeros(x.shape[0], device=x.device).long() - - -class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel): - def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False): - super().__init__(noise_schedule_config=noise_schedule_config) - self.max_noise_level = max_noise_level - - def forward(self, x, noise_level=None): - if noise_level is None: - noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long() - else: - assert isinstance(noise_level, torch.Tensor) - z = self.q_sample(x, noise_level) - return z, noise_level - - - diff --git a/ControlNet/ldm/modules/diffusionmodules/util.py b/ControlNet/ldm/modules/diffusionmodules/util.py deleted file mode 100644 index 59e0404fc4b9ce2dd35e7ac31b1f6c9155c6e5cc..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/diffusionmodules/util.py +++ /dev/null @@ -1,270 +0,0 @@ -# adopted from -# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py -# and -# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py -# and -# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py -# -# thanks! 
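The low-scale conditioning classes above (upscaling.py) noise a downsampled image with the standard closed-form forward process z = sqrt(alpha_bar_t) * x + sqrt(1 - alpha_bar_t) * eps, implemented by AbstractLowScaleModel.q_sample. A minimal usage sketch, illustrative only and not part of the deleted files; the import path is hypothetical (the module was removed) and shapes and schedule values are made up:

import torch
# hypothetical import of the deleted module
from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation

noise_cfg = {"beta_schedule": "linear", "timesteps": 1000, "linear_start": 1e-4, "linear_end": 2e-2}
aug = ImageConcatWithNoiseAugmentation(noise_schedule_config=noise_cfg, max_noise_level=1000)
x_low = torch.randn(4, 3, 64, 64)   # downsampled conditioning image, one per batch element
z, level = aug(x_low)               # noised image plus the random noise level drawn per sample
z_250, _ = aug(x_low, noise_level=torch.full((4,), 250, dtype=torch.long))  # or a fixed level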
- - -import os -import math -import torch -import torch.nn as nn -import numpy as np -from einops import repeat - -from ControlNet.ldm.util import instantiate_from_config - - -def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): - if schedule == "linear": - betas = ( - torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 - ) - - elif schedule == "cosine": - timesteps = ( - torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s - ) - alphas = timesteps / (1 + cosine_s) * np.pi / 2 - alphas = torch.cos(alphas).pow(2) - alphas = alphas / alphas[0] - betas = 1 - alphas[1:] / alphas[:-1] - betas = np.clip(betas, a_min=0, a_max=0.999) - - elif schedule == "sqrt_linear": - betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) - elif schedule == "sqrt": - betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5 - else: - raise ValueError(f"schedule '{schedule}' unknown.") - return betas.numpy() - - -def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True): - if ddim_discr_method == 'uniform': - c = num_ddpm_timesteps // num_ddim_timesteps - ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) - elif ddim_discr_method == 'quad': - ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int) - else: - raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"') - - # assert ddim_timesteps.shape[0] == num_ddim_timesteps - # add one to get the final alpha values right (the ones from first scale to data during sampling) - steps_out = ddim_timesteps + 1 - if verbose: - print(f'Selected timesteps for ddim sampler: {steps_out}') - return steps_out - - -def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): - # select alphas for computing the variance schedule - alphas = alphacums[ddim_timesteps] - alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) - - # according the the formula provided in https://arxiv.org/abs/2010.02502 - sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) - if verbose: - print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}') - print(f'For the chosen value of eta, which is {eta}, ' - f'this results in the following sigma_t schedule for ddim sampler {sigmas}') - return sigmas, alphas, alphas_prev - - -def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. - :param max_beta: the maximum beta to use; use values lower than 1 to - prevent singularities. 
- """ - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return np.array(betas) - - -def extract_into_tensor(a, t, x_shape): - b, *_ = t.shape - out = a.gather(-1, t) - return out.reshape(b, *((1,) * (len(x_shape) - 1))) - - -def checkpoint(func, inputs, params, flag): - """ - Evaluate a function without caching intermediate activations, allowing for - reduced memory at the expense of extra compute in the backward pass. - :param func: the function to evaluate. - :param inputs: the argument sequence to pass to `func`. - :param params: a sequence of parameters `func` depends on but does not - explicitly take as arguments. - :param flag: if False, disable gradient checkpointing. - """ - if flag: - args = tuple(inputs) + tuple(params) - return CheckpointFunction.apply(func, len(inputs), *args) - else: - return func(*inputs) - - -class CheckpointFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, run_function, length, *args): - ctx.run_function = run_function - ctx.input_tensors = list(args[:length]) - ctx.input_params = list(args[length:]) - ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(), - "dtype": torch.get_autocast_gpu_dtype(), - "cache_enabled": torch.is_autocast_cache_enabled()} - with torch.no_grad(): - output_tensors = ctx.run_function(*ctx.input_tensors) - return output_tensors - - @staticmethod - def backward(ctx, *output_grads): - ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] - with torch.enable_grad(), \ - torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): - # Fixes a bug where the first op in run_function modifies the - # Tensor storage in place, which is not allowed for detach()'d - # Tensors. - shallow_copies = [x.view_as(x) for x in ctx.input_tensors] - output_tensors = ctx.run_function(*shallow_copies) - input_grads = torch.autograd.grad( - output_tensors, - ctx.input_tensors + ctx.input_params, - output_grads, - allow_unused=True, - ) - del ctx.input_tensors - del ctx.input_params - del output_tensors - return (None, None) + input_grads - - -def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): - """ - Create sinusoidal timestep embeddings. - :param timesteps: a 1-D Tensor of N indices, one per batch element. - These may be fractional. - :param dim: the dimension of the output. - :param max_period: controls the minimum frequency of the embeddings. - :return: an [N x dim] Tensor of positional embeddings. - """ - if not repeat_only: - half = dim // 2 - freqs = torch.exp( - -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half - ).to(device=timesteps.device) - args = timesteps[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - else: - embedding = repeat(timesteps, 'b -> b d', d=dim) - return embedding - - -def zero_module(module): - """ - Zero out the parameters of a module and return it. - """ - for p in module.parameters(): - p.detach().zero_() - return module - - -def scale_module(module, scale): - """ - Scale the parameters of a module and return it. - """ - for p in module.parameters(): - p.detach().mul_(scale) - return module - - -def mean_flat(tensor): - """ - Take the mean over all non-batch dimensions. 
- """ - return tensor.mean(dim=list(range(1, len(tensor.shape)))) - - -def normalization(channels): - """ - Make a standard normalization layer. - :param channels: number of input channels. - :return: an nn.Module for normalization. - """ - return GroupNorm32(32, channels) - - -# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. -class SiLU(nn.Module): - def forward(self, x): - return x * torch.sigmoid(x) - - -class GroupNorm32(nn.GroupNorm): - def forward(self, x): - return super().forward(x.float()).type(x.dtype) - -def conv_nd(dims, *args, **kwargs): - """ - Create a 1D, 2D, or 3D convolution module. - """ - if dims == 1: - return nn.Conv1d(*args, **kwargs) - elif dims == 2: - return nn.Conv2d(*args, **kwargs) - elif dims == 3: - return nn.Conv3d(*args, **kwargs) - raise ValueError(f"unsupported dimensions: {dims}") - - -def linear(*args, **kwargs): - """ - Create a linear module. - """ - return nn.Linear(*args, **kwargs) - - -def avg_pool_nd(dims, *args, **kwargs): - """ - Create a 1D, 2D, or 3D average pooling module. - """ - if dims == 1: - return nn.AvgPool1d(*args, **kwargs) - elif dims == 2: - return nn.AvgPool2d(*args, **kwargs) - elif dims == 3: - return nn.AvgPool3d(*args, **kwargs) - raise ValueError(f"unsupported dimensions: {dims}") - - -class HybridConditioner(nn.Module): - - def __init__(self, c_concat_config, c_crossattn_config): - super().__init__() - self.concat_conditioner = instantiate_from_config(c_concat_config) - self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) - - def forward(self, c_concat, c_crossattn): - c_concat = self.concat_conditioner(c_concat) - c_crossattn = self.crossattn_conditioner(c_crossattn) - return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} - - -def noise_like(shape, device, repeat=False): - repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) - noise = lambda: torch.randn(shape, device=device) - return repeat_noise() if repeat else noise() \ No newline at end of file diff --git a/ControlNet/ldm/modules/distributions/__init__.py b/ControlNet/ldm/modules/distributions/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/modules/distributions/distributions.py b/ControlNet/ldm/modules/distributions/distributions.py deleted file mode 100644 index f2b8ef901130efc171aa69742ca0244d94d3f2e9..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/distributions/distributions.py +++ /dev/null @@ -1,92 +0,0 @@ -import torch -import numpy as np - - -class AbstractDistribution: - def sample(self): - raise NotImplementedError() - - def mode(self): - raise NotImplementedError() - - -class DiracDistribution(AbstractDistribution): - def __init__(self, value): - self.value = value - - def sample(self): - return self.value - - def mode(self): - return self.value - - -class DiagonalGaussianDistribution(object): - def __init__(self, parameters, deterministic=False): - self.parameters = parameters - self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) - self.logvar = torch.clamp(self.logvar, -30.0, 20.0) - self.deterministic = deterministic - self.std = torch.exp(0.5 * self.logvar) - self.var = torch.exp(self.logvar) - if self.deterministic: - self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) - - def sample(self): - x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) - return x - - def 
kl(self, other=None): - if self.deterministic: - return torch.Tensor([0.]) - else: - if other is None: - return 0.5 * torch.sum(torch.pow(self.mean, 2) - + self.var - 1.0 - self.logvar, - dim=[1, 2, 3]) - else: - return 0.5 * torch.sum( - torch.pow(self.mean - other.mean, 2) / other.var - + self.var / other.var - 1.0 - self.logvar + other.logvar, - dim=[1, 2, 3]) - - def nll(self, sample, dims=[1,2,3]): - if self.deterministic: - return torch.Tensor([0.]) - logtwopi = np.log(2.0 * np.pi) - return 0.5 * torch.sum( - logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, - dim=dims) - - def mode(self): - return self.mean - - -def normal_kl(mean1, logvar1, mean2, logvar2): - """ - source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 - Compute the KL divergence between two gaussians. - Shapes are automatically broadcasted, so batches can be compared to - scalars, among other use cases. - """ - tensor = None - for obj in (mean1, logvar1, mean2, logvar2): - if isinstance(obj, torch.Tensor): - tensor = obj - break - assert tensor is not None, "at least one argument must be a Tensor" - - # Force variances to be Tensors. Broadcasting helps convert scalars to - # Tensors, but it does not work for torch.exp(). - logvar1, logvar2 = [ - x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) - for x in (logvar1, logvar2) - ] - - return 0.5 * ( - -1.0 - + logvar2 - - logvar1 - + torch.exp(logvar1 - logvar2) - + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) - ) diff --git a/ControlNet/ldm/modules/ema.py b/ControlNet/ldm/modules/ema.py deleted file mode 100644 index bded25019b9bcbcd0260f0b8185f8c7859ca58c4..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/ema.py +++ /dev/null @@ -1,80 +0,0 @@ -import torch -from torch import nn - - -class LitEma(nn.Module): - def __init__(self, model, decay=0.9999, use_num_upates=True): - super().__init__() - if decay < 0.0 or decay > 1.0: - raise ValueError('Decay must be between 0 and 1') - - self.m_name2s_name = {} - self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) - self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates - else torch.tensor(-1, dtype=torch.int)) - - for name, p in model.named_parameters(): - if p.requires_grad: - # remove as '.'-character is not allowed in buffers - s_name = name.replace('.', '') - self.m_name2s_name.update({name: s_name}) - self.register_buffer(s_name, p.clone().detach().data) - - self.collected_params = [] - - def reset_num_updates(self): - del self.num_updates - self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int)) - - def forward(self, model): - decay = self.decay - - if self.num_updates >= 0: - self.num_updates += 1 - decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) - - one_minus_decay = 1.0 - decay - - with torch.no_grad(): - m_param = dict(model.named_parameters()) - shadow_params = dict(self.named_buffers()) - - for key in m_param: - if m_param[key].requires_grad: - sname = self.m_name2s_name[key] - shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) - shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) - else: - assert not key in self.m_name2s_name - - def copy_to(self, model): - m_param = dict(model.named_parameters()) - shadow_params = dict(self.named_buffers()) - for key in m_param: - if m_param[key].requires_grad: - 
m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) - else: - assert not key in self.m_name2s_name - - def store(self, parameters): - """ - Save the current parameters for restoring later. - Args: - parameters: Iterable of `torch.nn.Parameter`; the parameters to be - temporarily stored. - """ - self.collected_params = [param.clone() for param in parameters] - - def restore(self, parameters): - """ - Restore the parameters stored with the `store` method. - Useful to validate the model with EMA parameters without affecting the - original optimization process. Store the parameters before the - `copy_to` method. After validation (or model saving), use this to - restore the former parameters. - Args: - parameters: Iterable of `torch.nn.Parameter`; the parameters to be - updated with the stored parameters. - """ - for c_param, param in zip(self.collected_params, parameters): - param.data.copy_(c_param.data) diff --git a/ControlNet/ldm/modules/encoders/__init__.py b/ControlNet/ldm/modules/encoders/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/modules/encoders/modules.py b/ControlNet/ldm/modules/encoders/modules.py deleted file mode 100644 index 49aa4415937226d305ba596a54dffc0a1cb7fc9e..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/encoders/modules.py +++ /dev/null @@ -1,213 +0,0 @@ -import torch -import torch.nn as nn -from torch.utils.checkpoint import checkpoint - -from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel - -import open_clip -from ControlNet.ldm.util import default, count_params - - -class AbstractEncoder(nn.Module): - def __init__(self): - super().__init__() - - def encode(self, *args, **kwargs): - raise NotImplementedError - - -class IdentityEncoder(AbstractEncoder): - - def encode(self, x): - return x - - -class ClassEmbedder(nn.Module): - def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1): - super().__init__() - self.key = key - self.embedding = nn.Embedding(n_classes, embed_dim) - self.n_classes = n_classes - self.ucg_rate = ucg_rate - - def forward(self, batch, key=None, disable_dropout=False): - if key is None: - key = self.key - # this is for use in crossattn - c = batch[key][:, None] - if self.ucg_rate > 0. and not disable_dropout: - mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate) - c = mask * c + (1-mask) * torch.ones_like(c)*(self.n_classes-1) - c = c.long() - c = self.embedding(c) - return c - - def get_unconditional_conditioning(self, bs, device="cuda"): - uc_class = self.n_classes - 1 # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000) - uc = torch.ones((bs,), device=device) * uc_class - uc = {self.key: uc} - return uc - - -def disabled_train(self, mode=True): - """Overwrite model.train with this function to make sure train/eval mode - does not change anymore.""" - return self - - -class FrozenT5Embedder(AbstractEncoder): - """Uses the T5 transformer encoder for text""" - def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl - super().__init__() - self.tokenizer = T5Tokenizer.from_pretrained(version) - self.transformer = T5EncoderModel.from_pretrained(version) - self.device = device - self.max_length = max_length # TODO: typical value? 
- if freeze: - self.freeze() - - def freeze(self): - self.transformer = self.transformer.eval() - #self.train = disabled_train - for param in self.parameters(): - param.requires_grad = False - - def forward(self, text): - batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, - return_overflowing_tokens=False, padding="max_length", return_tensors="pt") - tokens = batch_encoding["input_ids"].to(self.device) - outputs = self.transformer(input_ids=tokens) - - z = outputs.last_hidden_state - return z - - def encode(self, text): - return self(text) - - -class FrozenCLIPEmbedder(AbstractEncoder): - """Uses the CLIP transformer encoder for text (from huggingface)""" - LAYERS = [ - "last", - "pooled", - "hidden" - ] - def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, - freeze=True, layer="last", layer_idx=None): # clip-vit-base-patch32 - super().__init__() - assert layer in self.LAYERS - self.tokenizer = CLIPTokenizer.from_pretrained(version) - self.transformer = CLIPTextModel.from_pretrained(version) - self.device = device - self.max_length = max_length - if freeze: - self.freeze() - self.layer = layer - self.layer_idx = layer_idx - if layer == "hidden": - assert layer_idx is not None - assert 0 <= abs(layer_idx) <= 12 - - def freeze(self): - self.transformer = self.transformer.eval() - #self.train = disabled_train - for param in self.parameters(): - param.requires_grad = False - - def forward(self, text): - batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, - return_overflowing_tokens=False, padding="max_length", return_tensors="pt") - tokens = batch_encoding["input_ids"].to(self.device) - outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden") - if self.layer == "last": - z = outputs.last_hidden_state - elif self.layer == "pooled": - z = outputs.pooler_output[:, None, :] - else: - z = outputs.hidden_states[self.layer_idx] - return z - - def encode(self, text): - return self(text) - - -class FrozenOpenCLIPEmbedder(AbstractEncoder): - """ - Uses the OpenCLIP transformer encoder for text - """ - LAYERS = [ - #"pooled", - "last", - "penultimate" - ] - def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77, - freeze=True, layer="last"): - super().__init__() - assert layer in self.LAYERS - model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version) - del model.visual - self.model = model - - self.device = device - self.max_length = max_length - if freeze: - self.freeze() - self.layer = layer - if self.layer == "last": - self.layer_idx = 0 - elif self.layer == "penultimate": - self.layer_idx = 1 - else: - raise NotImplementedError() - - def freeze(self): - self.model = self.model.eval() - for param in self.parameters(): - param.requires_grad = False - - def forward(self, text): - tokens = open_clip.tokenize(text) - z = self.encode_with_transformer(tokens.to(self.device)) - return z - - def encode_with_transformer(self, text): - x = self.model.token_embedding(text) # [batch_size, n_ctx, d_model] - x = x + self.model.positional_embedding - x = x.permute(1, 0, 2) # NLD -> LND - x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) - x = x.permute(1, 0, 2) # LND -> NLD - x = self.model.ln_final(x) - return x - - def text_transformer_forward(self, x: torch.Tensor, attn_mask = None): - for i, r in 
enumerate(self.model.transformer.resblocks): - if i == len(self.model.transformer.resblocks) - self.layer_idx: - break - if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting(): - x = checkpoint(r, x, attn_mask) - else: - x = r(x, attn_mask=attn_mask) - return x - - def encode(self, text): - return self(text) - - -class FrozenCLIPT5Encoder(AbstractEncoder): - def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda", - clip_max_length=77, t5_max_length=77): - super().__init__() - self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length) - self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length) - print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, " - f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.") - - def encode(self, text): - return self(text) - - def forward(self, text): - clip_z = self.clip_encoder.encode(text) - t5_z = self.t5_encoder.encode(text) - return [clip_z, t5_z] - - diff --git a/ControlNet/ldm/modules/image_degradation/__init__.py b/ControlNet/ldm/modules/image_degradation/__init__.py deleted file mode 100644 index 7836cada81f90ded99c58d5942eea4c3477f58fc..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/image_degradation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr -from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light diff --git a/ControlNet/ldm/modules/image_degradation/bsrgan.py b/ControlNet/ldm/modules/image_degradation/bsrgan.py deleted file mode 100644 index 32ef56169978e550090261cddbcf5eb611a6173b..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/image_degradation/bsrgan.py +++ /dev/null @@ -1,730 +0,0 @@ -# -*- coding: utf-8 -*- -""" -# -------------------------------------------- -# Super-Resolution -# -------------------------------------------- -# -# Kai Zhang (cskaizhang@gmail.com) -# https://github.com/cszn -# From 2019/03--2021/08 -# -------------------------------------------- -""" - -import numpy as np -import cv2 -import torch - -from functools import partial -import random -from scipy import ndimage -import scipy -import scipy.stats as ss -from scipy.interpolate import interp2d -from scipy.linalg import orth -import albumentations - -import ldm.modules.image_degradation.utils_image as util - - -def modcrop_np(img, sf): - ''' - Args: - img: numpy image, WxH or WxHxC - sf: scale factor - Return: - cropped image - ''' - w, h = img.shape[:2] - im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] 
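modcrop_np above simply trims an image so that both spatial dimensions are divisible by the scale factor before degradation. A tiny illustrative sketch, not part of the original file (the import is hypothetical since the module was deleted, and the shapes are made up):

import numpy as np
# from ldm.modules.image_degradation.bsrgan import modcrop_np  # hypothetical import of the deleted module

img = np.random.rand(257, 259, 3).astype(np.float32)  # H x W x C image in [0, 1]
cropped = modcrop_np(img, sf=4)
print(cropped.shape)  # (256, 256, 3): each spatial dim cut down to the nearest lower multiple of 4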
- - -""" -# -------------------------------------------- -# anisotropic Gaussian kernels -# -------------------------------------------- -""" - - -def analytic_kernel(k): - """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" - k_size = k.shape[0] - # Calculate the big kernels size - big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) - # Loop over the small kernel to fill the big one - for r in range(k_size): - for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k - # Crop the edges of the big kernel to ignore very small values and increase run time of SR - crop = k_size // 2 - cropped_big_k = big_k[crop:-crop, crop:-crop] - # Normalize to 1 - return cropped_big_k / cropped_big_k.sum() - - -def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): - """ generate an anisotropic Gaussian kernel - Args: - ksize : e.g., 15, kernel size - theta : [0, pi], rotation angle range - l1 : [0.1,50], scaling of eigenvalues - l2 : [0.1,l1], scaling of eigenvalues - If l1 = l2, will get an isotropic Gaussian kernel. - Returns: - k : kernel - """ - - v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.])) - V = np.array([[v[0], v[1]], [v[1], -v[0]]]) - D = np.array([[l1, 0], [0, l2]]) - Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) - k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) - - return k - - -def gm_blur_kernel(mean, cov, size=15): - center = size / 2.0 + 0.5 - k = np.zeros([size, size]) - for y in range(size): - for x in range(size): - cy = y - center + 1 - cx = x - center + 1 - k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) - - k = k / np.sum(k) - return k - - -def shift_pixel(x, sf, upper_left=True): - """shift pixel for super-resolution with different scale factors - Args: - x: WxHxC or WxH - sf: scale factor - upper_left: shift direction - """ - h, w = x.shape[:2] - shift = (sf - 1) * 0.5 - xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) - if upper_left: - x1 = xv + shift - y1 = yv + shift - else: - x1 = xv - shift - y1 = yv - shift - - x1 = np.clip(x1, 0, w - 1) - y1 = np.clip(y1, 0, h - 1) - - if x.ndim == 2: - x = interp2d(xv, yv, x)(x1, y1) - if x.ndim == 3: - for i in range(x.shape[-1]): - x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) - - return x - - -def blur(x, k): - ''' - x: image, NxcxHxW - k: kernel, Nx1xhxw - ''' - n, c = x.shape[:2] - p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 - x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate') - k = k.repeat(1, c, 1, 1) - k = k.view(-1, 1, k.shape[2], k.shape[3]) - x = x.view(1, -1, x.shape[2], x.shape[3]) - x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) - x = x.view(n, c, x.shape[2], x.shape[3]) - - return x - - -def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0): - """" - # modified version of https://github.com/assafshocher/BlindSR_dataset_generator - # Kai Zhang - # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var - # max_var = 2.5 * sf - """ - # Set random eigen-vals (lambdas) and angle (theta) for COV matrix - lambda_1 = min_var + np.random.rand() * (max_var - min_var) - lambda_2 = min_var + np.random.rand() * (max_var - min_var) - theta = np.random.rand() * np.pi # random theta - noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 - - # Set COV matrix using Lambdas and Theta - LAMBDA = 
np.diag([lambda_1, lambda_2]) - Q = np.array([[np.cos(theta), -np.sin(theta)], - [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @ LAMBDA @ Q.T - INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] - - # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) - MU = MU[None, None, :, None] - - # Create meshgrid for Gaussian - [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) - Z = np.stack([X, Y], 2)[:, :, :, None] - - # Calcualte Gaussian for every pixel of the kernel - ZZ = Z - MU - ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) - - # shift the kernel so it will be centered - # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) - - # Normalize the kernel and return - # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) - kernel = raw_kernel / np.sum(raw_kernel) - return kernel - - -def fspecial_gaussian(hsize, sigma): - hsize = [hsize, hsize] - siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] - std = sigma - [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) - arg = -(x * x + y * y) / (2 * std * std) - h = np.exp(arg) - h[h < scipy.finfo(float).eps * h.max()] = 0 - sumh = h.sum() - if sumh != 0: - h = h / sumh - return h - - -def fspecial_laplacian(alpha): - alpha = max([0, min([alpha, 1])]) - h1 = alpha / (alpha + 1) - h2 = (1 - alpha) / (alpha + 1) - h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] - h = np.array(h) - return h - - -def fspecial(filter_type, *args, **kwargs): - ''' - python code from: - https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py - ''' - if filter_type == 'gaussian': - return fspecial_gaussian(*args, **kwargs) - if filter_type == 'laplacian': - return fspecial_laplacian(*args, **kwargs) - - -""" -# -------------------------------------------- -# degradation models -# -------------------------------------------- -""" - - -def bicubic_degradation(x, sf=3): - ''' - Args: - x: HxWxC image, [0, 1] - sf: down-scale factor - Return: - bicubicly downsampled LR image - ''' - x = util.imresize_np(x, scale=1 / sf) - return x - - -def srmd_degradation(x, k, sf=3): - ''' blur + bicubic downsampling - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2018learning, - title={Learning a single convolutional super-resolution network for multiple degradations}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={3262--3271}, - year={2018} - } - ''' - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror' - x = bicubic_degradation(x, sf=sf) - return x - - -def dpsr_degradation(x, k, sf=3): - ''' bicubic downsampling + blur - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2019deep, - title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={1671--1681}, - year={2019} - } - ''' - x = bicubic_degradation(x, sf=sf) - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') - return x - - -def 
classical_degradation(x, k, sf=3): - ''' blur + downsampling - Args: - x: HxWxC image, [0, 1]/[0, 255] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - ''' - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') - # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) - st = 0 - return x[st::sf, st::sf, ...] - - -def add_sharpening(img, weight=0.5, radius=50, threshold=10): - """USM sharpening. borrowed from real-ESRGAN - Input image: I; Blurry image: B. - 1. K = I + weight * (I - B) - 2. Mask = 1 if abs(I - B) > threshold, else: 0 - 3. Blur mask: - 4. Out = Mask * K + (1 - Mask) * I - Args: - img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. - weight (float): Sharp weight. Default: 1. - radius (float): Kernel size of Gaussian blur. Default: 50. - threshold (int): - """ - if radius % 2 == 0: - radius += 1 - blur = cv2.GaussianBlur(img, (radius, radius), 0) - residual = img - blur - mask = np.abs(residual) * 255 > threshold - mask = mask.astype('float32') - soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) - - K = img + weight * residual - K = np.clip(K, 0, 1) - return soft_mask * K + (1 - soft_mask) * img - - -def add_blur(img, sf=4): - wd2 = 4.0 + sf - wd = 2.0 + 0.2 * sf - if random.random() < 0.5: - l1 = wd2 * random.random() - l2 = wd2 * random.random() - k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) - else: - k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random()) - img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror') - - return img - - -def add_resize(img, sf=4): - rnum = np.random.rand() - if rnum > 0.8: # up - sf1 = random.uniform(1, 2) - elif rnum < 0.7: # down - sf1 = random.uniform(0.5 / sf, 1) - else: - sf1 = 1.0 - img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - return img - - -# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): -# noise_level = random.randint(noise_level1, noise_level2) -# rnum = np.random.rand() -# if rnum > 0.6: # add color Gaussian noise -# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) -# elif rnum < 0.4: # add grayscale Gaussian noise -# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) -# else: # add noise -# L = noise_level2 / 255. -# D = np.diag(np.random.rand(3)) -# U = orth(np.random.rand(3, 3)) -# conv = np.dot(np.dot(np.transpose(U), D), U) -# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) -# img = np.clip(img, 0.0, 1.0) -# return img - -def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - rnum = np.random.rand() - if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: # add noise - L = noise_level2 / 255. 
- D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_speckle_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - img = np.clip(img, 0.0, 1.0) - rnum = random.random() - if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: - L = noise_level2 / 255. - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_Poisson_noise(img): - img = np.clip((img * 255.0).round(), 0, 255) / 255. - vals = 10 ** (2 * random.random() + 2.0) # [2, 4] - if random.random() < 0.5: - img = np.random.poisson(img * vals).astype(np.float32) / vals - else: - img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) - img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255. - noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray - img += noise_gray[:, :, np.newaxis] - img = np.clip(img, 0.0, 1.0) - return img - - -def add_JPEG_noise(img): - quality_factor = random.randint(30, 95) - img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) - img = cv2.imdecode(encimg, 1) - img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) - return img - - -def random_crop(lq, hq, sf=4, lq_patchsize=64): - h, w = lq.shape[:2] - rnd_h = random.randint(0, h - lq_patchsize) - rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] - - rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :] - return lq, hq - - -def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] 
# mod crop - h, w = img.shape[:2] - - if h < lq_patchsize * sf or w < lq_patchsize * sf: - raise ValueError(f'img size ({h1}X{w1}) is too small!') - - hq = img.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - img = util.imresize_np(img, 1 / 2, True) - img = np.clip(img, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - img = add_blur(img, sf=sf) - - elif i == 1: - img = add_blur(img, sf=sf) - - elif i == 2: - a, b = img.shape[1], img.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror') - img = img[0::sf, 0::sf, ...] # nearest downsampling - img = np.clip(img, 0.0, 1.0) - - elif i == 3: - # downsample3 - img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - img = add_JPEG_noise(img) - - elif i == 6: - # add processed camera sensor noise - if random.random() < isp_prob and isp_model is not None: - with torch.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - img = add_JPEG_noise(img) - - # random crop - img, hq = random_crop(img, hq, sf_ori, lq_patchsize) - - return img, hq - - -# todo no isp_model? -def degradation_bsrgan_variant(image, sf=4, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - image = util.uint2single(image) - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] 
# mod crop - h, w = image.shape[:2] - - hq = image.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - image = util.imresize_np(image, 1 / 2, True) - image = np.clip(image, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - image = add_blur(image, sf=sf) - - elif i == 1: - image = add_blur(image, sf=sf) - - elif i == 2: - a, b = image.shape[1], image.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror') - image = image[0::sf, 0::sf, ...] # nearest downsampling - image = np.clip(image, 0.0, 1.0) - - elif i == 3: - # downsample3 - image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - image = np.clip(image, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - image = add_JPEG_noise(image) - - # elif i == 6: - # # add processed camera sensor noise - # if random.random() < isp_prob and isp_model is not None: - # with torch.no_grad(): - # img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - image = add_JPEG_noise(image) - image = util.single2uint(image) - example = {"image":image} - return example - - -# TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc... -def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None): - """ - This is an extended degradation model by combining - the degradation models of BSRGAN and Real-ESRGAN - ---------- - img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) - sf: scale factor - use_shuffle: the degradation shuffle - use_sharp: sharpening the img - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - - h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] 
# mod crop - h, w = img.shape[:2] - - if h < lq_patchsize * sf or w < lq_patchsize * sf: - raise ValueError(f'img size ({h1}X{w1}) is too small!') - - if use_sharp: - img = add_sharpening(img) - hq = img.copy() - - if random.random() < shuffle_prob: - shuffle_order = random.sample(range(13), 13) - else: - shuffle_order = list(range(13)) - # local shuffle for noise, JPEG is always the last one - shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6))) - shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13))) - - poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1 - - for i in shuffle_order: - if i == 0: - img = add_blur(img, sf=sf) - elif i == 1: - img = add_resize(img, sf=sf) - elif i == 2: - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) - elif i == 3: - if random.random() < poisson_prob: - img = add_Poisson_noise(img) - elif i == 4: - if random.random() < speckle_prob: - img = add_speckle_noise(img) - elif i == 5: - if random.random() < isp_prob and isp_model is not None: - with torch.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - elif i == 6: - img = add_JPEG_noise(img) - elif i == 7: - img = add_blur(img, sf=sf) - elif i == 8: - img = add_resize(img, sf=sf) - elif i == 9: - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) - elif i == 10: - if random.random() < poisson_prob: - img = add_Poisson_noise(img) - elif i == 11: - if random.random() < speckle_prob: - img = add_speckle_noise(img) - elif i == 12: - if random.random() < isp_prob and isp_model is not None: - with torch.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - else: - print('check the shuffle!') - - # resize to desired size - img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])), - interpolation=random.choice([1, 2, 3])) - - # add final JPEG compression noise - img = add_JPEG_noise(img) - - # random crop - img, hq = random_crop(img, hq, sf, lq_patchsize) - - return img, hq - - -if __name__ == '__main__': - print("hey") - img = util.imread_uint('utils/test.png', 3) - print(img) - img = util.uint2single(img) - print(img) - img = img[:448, :448] - h = img.shape[0] // 4 - print("resizing to", h) - sf = 4 - deg_fn = partial(degradation_bsrgan_variant, sf=sf) - for i in range(20): - print(i) - img_lq = deg_fn(img) - print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"] - print(img_lq.shape) - print("bicubic", img_lq_bicubic.shape) - print(img_hq.shape) - lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0) - lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0) - img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) - util.imsave(img_concat, str(i) + '.png') - - diff --git a/ControlNet/ldm/modules/image_degradation/bsrgan_light.py b/ControlNet/ldm/modules/image_degradation/bsrgan_light.py deleted file mode 100644 index 808c7f882cb75e2ba2340d5b55881d11927351f0..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/image_degradation/bsrgan_light.py +++ /dev/null @@ -1,651 +0,0 @@ -# -*- coding: utf-8 -*- -import numpy as np -import cv2 -import torch - -from functools import partial -import random -from scipy import ndimage -import scipy -import scipy.stats as ss -from scipy.interpolate import interp2d -from scipy.linalg import orth 
-import albumentations - -import ldm.modules.image_degradation.utils_image as util - -""" -# -------------------------------------------- -# Super-Resolution -# -------------------------------------------- -# -# Kai Zhang (cskaizhang@gmail.com) -# https://github.com/cszn -# From 2019/03--2021/08 -# -------------------------------------------- -""" - -def modcrop_np(img, sf): - ''' - Args: - img: numpy image, WxH or WxHxC - sf: scale factor - Return: - cropped image - ''' - w, h = img.shape[:2] - im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] - - -""" -# -------------------------------------------- -# anisotropic Gaussian kernels -# -------------------------------------------- -""" - - -def analytic_kernel(k): - """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" - k_size = k.shape[0] - # Calculate the big kernels size - big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) - # Loop over the small kernel to fill the big one - for r in range(k_size): - for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k - # Crop the edges of the big kernel to ignore very small values and increase run time of SR - crop = k_size // 2 - cropped_big_k = big_k[crop:-crop, crop:-crop] - # Normalize to 1 - return cropped_big_k / cropped_big_k.sum() - - -def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): - """ generate an anisotropic Gaussian kernel - Args: - ksize : e.g., 15, kernel size - theta : [0, pi], rotation angle range - l1 : [0.1,50], scaling of eigenvalues - l2 : [0.1,l1], scaling of eigenvalues - If l1 = l2, will get an isotropic Gaussian kernel. - Returns: - k : kernel - """ - - v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.])) - V = np.array([[v[0], v[1]], [v[1], -v[0]]]) - D = np.array([[l1, 0], [0, l2]]) - Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) - k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) - - return k - - -def gm_blur_kernel(mean, cov, size=15): - center = size / 2.0 + 0.5 - k = np.zeros([size, size]) - for y in range(size): - for x in range(size): - cy = y - center + 1 - cx = x - center + 1 - k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) - - k = k / np.sum(k) - return k - - -def shift_pixel(x, sf, upper_left=True): - """shift pixel for super-resolution with different scale factors - Args: - x: WxHxC or WxH - sf: scale factor - upper_left: shift direction - """ - h, w = x.shape[:2] - shift = (sf - 1) * 0.5 - xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) - if upper_left: - x1 = xv + shift - y1 = yv + shift - else: - x1 = xv - shift - y1 = yv - shift - - x1 = np.clip(x1, 0, w - 1) - y1 = np.clip(y1, 0, h - 1) - - if x.ndim == 2: - x = interp2d(xv, yv, x)(x1, y1) - if x.ndim == 3: - for i in range(x.shape[-1]): - x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) - - return x - - -def blur(x, k): - ''' - x: image, NxcxHxW - k: kernel, Nx1xhxw - ''' - n, c = x.shape[:2] - p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 - x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate') - k = k.repeat(1, c, 1, 1) - k = k.view(-1, 1, k.shape[2], k.shape[3]) - x = x.view(1, -1, x.shape[2], x.shape[3]) - x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) - x = x.view(n, c, x.shape[2], x.shape[3]) - - return x - - -def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0): - """" - # modified version of 
https://github.com/assafshocher/BlindSR_dataset_generator - # Kai Zhang - # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var - # max_var = 2.5 * sf - """ - # Set random eigen-vals (lambdas) and angle (theta) for COV matrix - lambda_1 = min_var + np.random.rand() * (max_var - min_var) - lambda_2 = min_var + np.random.rand() * (max_var - min_var) - theta = np.random.rand() * np.pi # random theta - noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 - - # Set COV matrix using Lambdas and Theta - LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array([[np.cos(theta), -np.sin(theta)], - [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @ LAMBDA @ Q.T - INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] - - # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) - MU = MU[None, None, :, None] - - # Create meshgrid for Gaussian - [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) - Z = np.stack([X, Y], 2)[:, :, :, None] - - # Calcualte Gaussian for every pixel of the kernel - ZZ = Z - MU - ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) - - # shift the kernel so it will be centered - # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) - - # Normalize the kernel and return - # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) - kernel = raw_kernel / np.sum(raw_kernel) - return kernel - - -def fspecial_gaussian(hsize, sigma): - hsize = [hsize, hsize] - siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] - std = sigma - [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) - arg = -(x * x + y * y) / (2 * std * std) - h = np.exp(arg) - h[h < scipy.finfo(float).eps * h.max()] = 0 - sumh = h.sum() - if sumh != 0: - h = h / sumh - return h - - -def fspecial_laplacian(alpha): - alpha = max([0, min([alpha, 1])]) - h1 = alpha / (alpha + 1) - h2 = (1 - alpha) / (alpha + 1) - h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] - h = np.array(h) - return h - - -def fspecial(filter_type, *args, **kwargs): - ''' - python code from: - https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py - ''' - if filter_type == 'gaussian': - return fspecial_gaussian(*args, **kwargs) - if filter_type == 'laplacian': - return fspecial_laplacian(*args, **kwargs) - - -""" -# -------------------------------------------- -# degradation models -# -------------------------------------------- -""" - - -def bicubic_degradation(x, sf=3): - ''' - Args: - x: HxWxC image, [0, 1] - sf: down-scale factor - Return: - bicubicly downsampled LR image - ''' - x = util.imresize_np(x, scale=1 / sf) - return x - - -def srmd_degradation(x, k, sf=3): - ''' blur + bicubic downsampling - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2018learning, - title={Learning a single convolutional super-resolution network for multiple degradations}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={3262--3271}, - year={2018} - } - ''' - x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror' - x = bicubic_degradation(x, sf=sf) - return x - - -def dpsr_degradation(x, k, sf=3): - ''' bicubic 
downsampling + blur - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2019deep, - title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={1671--1681}, - year={2019} - } - ''' - x = bicubic_degradation(x, sf=sf) - x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap') - return x - - -def classical_degradation(x, k, sf=3): - ''' blur + downsampling - Args: - x: HxWxC image, [0, 1]/[0, 255] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - ''' - x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap') - # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) - st = 0 - return x[st::sf, st::sf, ...] - - -def add_sharpening(img, weight=0.5, radius=50, threshold=10): - """USM sharpening. borrowed from real-ESRGAN - Input image: I; Blurry image: B. - 1. K = I + weight * (I - B) - 2. Mask = 1 if abs(I - B) > threshold, else: 0 - 3. Blur mask: - 4. Out = Mask * K + (1 - Mask) * I - Args: - img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. - weight (float): Sharp weight. Default: 1. - radius (float): Kernel size of Gaussian blur. Default: 50. - threshold (int): - """ - if radius % 2 == 0: - radius += 1 - blur = cv2.GaussianBlur(img, (radius, radius), 0) - residual = img - blur - mask = np.abs(residual) * 255 > threshold - mask = mask.astype('float32') - soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) - - K = img + weight * residual - K = np.clip(K, 0, 1) - return soft_mask * K + (1 - soft_mask) * img - - -def add_blur(img, sf=4): - wd2 = 4.0 + sf - wd = 2.0 + 0.2 * sf - - wd2 = wd2/4 - wd = wd/4 - - if random.random() < 0.5: - l1 = wd2 * random.random() - l2 = wd2 * random.random() - k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) - else: - k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random()) - img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror') - - return img - - -def add_resize(img, sf=4): - rnum = np.random.rand() - if rnum > 0.8: # up - sf1 = random.uniform(1, 2) - elif rnum < 0.7: # down - sf1 = random.uniform(0.5 / sf, 1) - else: - sf1 = 1.0 - img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - return img - - -# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): -# noise_level = random.randint(noise_level1, noise_level2) -# rnum = np.random.rand() -# if rnum > 0.6: # add color Gaussian noise -# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) -# elif rnum < 0.4: # add grayscale Gaussian noise -# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) -# else: # add noise -# L = noise_level2 / 255. 
-# D = np.diag(np.random.rand(3)) -# U = orth(np.random.rand(3, 3)) -# conv = np.dot(np.dot(np.transpose(U), D), U) -# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) -# img = np.clip(img, 0.0, 1.0) -# return img - -def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - rnum = np.random.rand() - if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: # add noise - L = noise_level2 / 255. - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_speckle_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - img = np.clip(img, 0.0, 1.0) - rnum = random.random() - if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: - L = noise_level2 / 255. - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_Poisson_noise(img): - img = np.clip((img * 255.0).round(), 0, 255) / 255. - vals = 10 ** (2 * random.random() + 2.0) # [2, 4] - if random.random() < 0.5: - img = np.random.poisson(img * vals).astype(np.float32) / vals - else: - img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) - img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255. 
- noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray - img += noise_gray[:, :, np.newaxis] - img = np.clip(img, 0.0, 1.0) - return img - - -def add_JPEG_noise(img): - quality_factor = random.randint(80, 95) - img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) - img = cv2.imdecode(encimg, 1) - img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) - return img - - -def random_crop(lq, hq, sf=4, lq_patchsize=64): - h, w = lq.shape[:2] - rnd_h = random.randint(0, h - lq_patchsize) - rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] - - rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :] - return lq, hq - - -def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop - h, w = img.shape[:2] - - if h < lq_patchsize * sf or w < lq_patchsize * sf: - raise ValueError(f'img size ({h1}X{w1}) is too small!') - - hq = img.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - img = util.imresize_np(img, 1 / 2, True) - img = np.clip(img, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - img = add_blur(img, sf=sf) - - elif i == 1: - img = add_blur(img, sf=sf) - - elif i == 2: - a, b = img.shape[1], img.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror') - img = img[0::sf, 0::sf, ...] 
# nearest downsampling - img = np.clip(img, 0.0, 1.0) - - elif i == 3: - # downsample3 - img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - img = add_JPEG_noise(img) - - elif i == 6: - # add processed camera sensor noise - if random.random() < isp_prob and isp_model is not None: - with torch.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - img = add_JPEG_noise(img) - - # random crop - img, hq = random_crop(img, hq, sf_ori, lq_patchsize) - - return img, hq - - -# todo no isp_model? -def degradation_bsrgan_variant(image, sf=4, isp_model=None, up=False): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - image = util.uint2single(image) - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop - h, w = image.shape[:2] - - hq = image.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - image = util.imresize_np(image, 1 / 2, True) - image = np.clip(image, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - image = add_blur(image, sf=sf) - - # elif i == 1: - # image = add_blur(image, sf=sf) - - if i == 0: - pass - - elif i == 2: - a, b = image.shape[1], image.shape[0] - # downsample2 - if random.random() < 0.8: - sf1 = random.uniform(1, 2 * sf) - image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3])) - else: - k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror') - image = image[0::sf, 0::sf, ...] 
# nearest downsampling - - image = np.clip(image, 0.0, 1.0) - - elif i == 3: - # downsample3 - image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - image = np.clip(image, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - image = add_JPEG_noise(image) - # - # elif i == 6: - # # add processed camera sensor noise - # if random.random() < isp_prob and isp_model is not None: - # with torch.no_grad(): - # img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - image = add_JPEG_noise(image) - image = util.single2uint(image) - if up: - image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_CUBIC) # todo: random, as above? want to condition on it then - example = {"image": image} - return example - - - - -if __name__ == '__main__': - print("hey") - img = util.imread_uint('utils/test.png', 3) - img = img[:448, :448] - h = img.shape[0] // 4 - print("resizing to", h) - sf = 4 - deg_fn = partial(degradation_bsrgan_variant, sf=sf) - for i in range(20): - print(i) - img_hq = img - img_lq = deg_fn(img)["image"] - img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) - print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] - print(img_lq.shape) - print("bicubic", img_lq_bicubic.shape) - print(img_hq.shape) - lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0) - lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), - (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0) - img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) - util.imsave(img_concat, str(i) + '.png') diff --git a/ControlNet/ldm/modules/image_degradation/utils/test.png b/ControlNet/ldm/modules/image_degradation/utils/test.png deleted file mode 100644 index e720ed04ac7e1e7938d367e692fb6a742c54a24c..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/image_degradation/utils/test.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92e516278f0d3e85e84cfb55b43338e12d5896a0ee3833aafdf378025457d753 -size 441072 diff --git a/ControlNet/ldm/modules/image_degradation/utils_image.py b/ControlNet/ldm/modules/image_degradation/utils_image.py deleted file mode 100644 index 0175f155ad900ae33c3c46ed87f49b352e3faf98..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/image_degradation/utils_image.py +++ /dev/null @@ -1,916 +0,0 @@ -import os -import math -import random -import numpy as np -import torch -import cv2 -from torchvision.utils import make_grid -from datetime import datetime -#import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py - - -os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" - - -''' -# -------------------------------------------- -# Kai Zhang (github: https://github.com/cszn) -# 03/Mar/2019 -# -------------------------------------------- -# https://github.com/twhui/SRGAN-pyTorch -# https://github.com/xinntao/BasicSR -# -------------------------------------------- -''' - - -IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif'] - - -def is_image_file(filename): - return any(filename.endswith(extension) for extension in 
IMG_EXTENSIONS) - - -def get_timestamp(): - return datetime.now().strftime('%y%m%d-%H%M%S') - - -def imshow(x, title=None, cbar=False, figsize=None): - plt.figure(figsize=figsize) - plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray') - if title: - plt.title(title) - if cbar: - plt.colorbar() - plt.show() - - -def surf(Z, cmap='rainbow', figsize=None): - plt.figure(figsize=figsize) - ax3 = plt.axes(projection='3d') - - w, h = Z.shape[:2] - xx = np.arange(0,w,1) - yy = np.arange(0,h,1) - X, Y = np.meshgrid(xx, yy) - ax3.plot_surface(X,Y,Z,cmap=cmap) - #ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap) - plt.show() - - -''' -# -------------------------------------------- -# get image pathes -# -------------------------------------------- -''' - - -def get_image_paths(dataroot): - paths = None # return None if dataroot is None - if dataroot is not None: - paths = sorted(_get_paths_from_images(dataroot)) - return paths - - -def _get_paths_from_images(path): - assert os.path.isdir(path), '{:s} is not a valid directory'.format(path) - images = [] - for dirpath, _, fnames in sorted(os.walk(path)): - for fname in sorted(fnames): - if is_image_file(fname): - img_path = os.path.join(dirpath, fname) - images.append(img_path) - assert images, '{:s} has no valid image file'.format(path) - return images - - -''' -# -------------------------------------------- -# split large images into small images -# -------------------------------------------- -''' - - -def patches_from_image(img, p_size=512, p_overlap=64, p_max=800): - w, h = img.shape[:2] - patches = [] - if w > p_max and h > p_max: - w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int)) - h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int)) - w1.append(w-p_size) - h1.append(h-p_size) -# print(w1) -# print(h1) - for i in w1: - for j in h1: - patches.append(img[i:i+p_size, j:j+p_size,:]) - else: - patches.append(img) - - return patches - - -def imssave(imgs, img_path): - """ - imgs: list, N images of size WxHxC - """ - img_name, ext = os.path.splitext(os.path.basename(img_path)) - - for i, img in enumerate(imgs): - if img.ndim == 3: - img = img[:, :, [2, 1, 0]] - new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png') - cv2.imwrite(new_path, img) - - -def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000): - """ - split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size), - and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max) - will be splitted. - Args: - original_dataroot: - taget_dataroot: - p_size: size of small images - p_overlap: patch size in training is a good choice - p_max: images with smaller size than (p_max)x(p_max) keep unchanged. 
- """ - paths = get_image_paths(original_dataroot) - for img_path in paths: - # img_name, ext = os.path.splitext(os.path.basename(img_path)) - img = imread_uint(img_path, n_channels=n_channels) - patches = patches_from_image(img, p_size, p_overlap, p_max) - imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path))) - #if original_dataroot == taget_dataroot: - #del img_path - -''' -# -------------------------------------------- -# makedir -# -------------------------------------------- -''' - - -def mkdir(path): - if not os.path.exists(path): - os.makedirs(path) - - -def mkdirs(paths): - if isinstance(paths, str): - mkdir(paths) - else: - for path in paths: - mkdir(path) - - -def mkdir_and_rename(path): - if os.path.exists(path): - new_name = path + '_archived_' + get_timestamp() - print('Path already exists. Rename it to [{:s}]'.format(new_name)) - os.rename(path, new_name) - os.makedirs(path) - - -''' -# -------------------------------------------- -# read image from path -# opencv is fast, but read BGR numpy image -# -------------------------------------------- -''' - - -# -------------------------------------------- -# get uint8 image of size HxWxn_channles (RGB) -# -------------------------------------------- -def imread_uint(path, n_channels=3): - # input: path - # output: HxWx3(RGB or GGG), or HxWx1 (G) - if n_channels == 1: - img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE - img = np.expand_dims(img, axis=2) # HxWx1 - elif n_channels == 3: - img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG - else: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB - return img - - -# -------------------------------------------- -# matlab's imwrite -# -------------------------------------------- -def imsave(img, img_path): - img = np.squeeze(img) - if img.ndim == 3: - img = img[:, :, [2, 1, 0]] - cv2.imwrite(img_path, img) - -def imwrite(img, img_path): - img = np.squeeze(img) - if img.ndim == 3: - img = img[:, :, [2, 1, 0]] - cv2.imwrite(img_path, img) - - - -# -------------------------------------------- -# get single image of size HxWxn_channles (BGR) -# -------------------------------------------- -def read_img(path): - # read image by cv2 - # return: Numpy float32, HWC, BGR, [0,1] - img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # cv2.IMREAD_GRAYSCALE - img = img.astype(np.float32) / 255. - if img.ndim == 2: - img = np.expand_dims(img, axis=2) - # some images have 4 channels - if img.shape[2] > 3: - img = img[:, :, :3] - return img - - -''' -# -------------------------------------------- -# image format conversion -# -------------------------------------------- -# numpy(single) <---> numpy(unit) -# numpy(single) <---> tensor -# numpy(unit) <---> tensor -# -------------------------------------------- -''' - - -# -------------------------------------------- -# numpy(single) [0, 1] <---> numpy(unit) -# -------------------------------------------- - - -def uint2single(img): - - return np.float32(img/255.) - - -def single2uint(img): - - return np.uint8((img.clip(0, 1)*255.).round()) - - -def uint162single(img): - - return np.float32(img/65535.) 
- - -def single2uint16(img): - - return np.uint16((img.clip(0, 1)*65535.).round()) - - -# -------------------------------------------- -# numpy(unit) (HxWxC or HxW) <---> tensor -# -------------------------------------------- - - -# convert uint to 4-dimensional torch tensor -def uint2tensor4(img): - if img.ndim == 2: - img = np.expand_dims(img, axis=2) - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0) - - -# convert uint to 3-dimensional torch tensor -def uint2tensor3(img): - if img.ndim == 2: - img = np.expand_dims(img, axis=2) - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.) - - -# convert 2/3/4-dimensional torch tensor to uint -def tensor2uint(img): - img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy() - if img.ndim == 3: - img = np.transpose(img, (1, 2, 0)) - return np.uint8((img*255.0).round()) - - -# -------------------------------------------- -# numpy(single) (HxWxC) <---> tensor -# -------------------------------------------- - - -# convert single (HxWxC) to 3-dimensional torch tensor -def single2tensor3(img): - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float() - - -# convert single (HxWxC) to 4-dimensional torch tensor -def single2tensor4(img): - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0) - - -# convert torch tensor to single -def tensor2single(img): - img = img.data.squeeze().float().cpu().numpy() - if img.ndim == 3: - img = np.transpose(img, (1, 2, 0)) - - return img - -# convert torch tensor to single -def tensor2single3(img): - img = img.data.squeeze().float().cpu().numpy() - if img.ndim == 3: - img = np.transpose(img, (1, 2, 0)) - elif img.ndim == 2: - img = np.expand_dims(img, axis=2) - return img - - -def single2tensor5(img): - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0) - - -def single32tensor5(img): - return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0) - - -def single42tensor4(img): - return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float() - - -# from skimage.io import imread, imsave -def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)): - ''' - Converts a torch Tensor into an image Numpy array of BGR channel order - Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order - Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default) - ''' - tensor = tensor.squeeze().float().cpu().clamp_(*min_max) # squeeze first, then clamp - tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0]) # to range [0,1] - n_dim = tensor.dim() - if n_dim == 4: - n_img = len(tensor) - img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy() - img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR - elif n_dim == 3: - img_np = tensor.numpy() - img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR - elif n_dim == 2: - img_np = tensor.numpy() - else: - raise TypeError( - 'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim)) - if out_type == np.uint8: - img_np = (img_np * 255.0).round() - # Important. Unlike matlab, numpy.unit8() WILL NOT round by default. - return img_np.astype(out_type) - - -''' -# -------------------------------------------- -# Augmentation, flipe and/or rotate -# -------------------------------------------- -# The following two are enough. 
-# (1) augmet_img: numpy image of WxHxC or WxH -# (2) augment_img_tensor4: tensor image 1xCxWxH -# -------------------------------------------- -''' - - -def augment_img(img, mode=0): - '''Kai Zhang (github: https://github.com/cszn) - ''' - if mode == 0: - return img - elif mode == 1: - return np.flipud(np.rot90(img)) - elif mode == 2: - return np.flipud(img) - elif mode == 3: - return np.rot90(img, k=3) - elif mode == 4: - return np.flipud(np.rot90(img, k=2)) - elif mode == 5: - return np.rot90(img) - elif mode == 6: - return np.rot90(img, k=2) - elif mode == 7: - return np.flipud(np.rot90(img, k=3)) - - -def augment_img_tensor4(img, mode=0): - '''Kai Zhang (github: https://github.com/cszn) - ''' - if mode == 0: - return img - elif mode == 1: - return img.rot90(1, [2, 3]).flip([2]) - elif mode == 2: - return img.flip([2]) - elif mode == 3: - return img.rot90(3, [2, 3]) - elif mode == 4: - return img.rot90(2, [2, 3]).flip([2]) - elif mode == 5: - return img.rot90(1, [2, 3]) - elif mode == 6: - return img.rot90(2, [2, 3]) - elif mode == 7: - return img.rot90(3, [2, 3]).flip([2]) - - -def augment_img_tensor(img, mode=0): - '''Kai Zhang (github: https://github.com/cszn) - ''' - img_size = img.size() - img_np = img.data.cpu().numpy() - if len(img_size) == 3: - img_np = np.transpose(img_np, (1, 2, 0)) - elif len(img_size) == 4: - img_np = np.transpose(img_np, (2, 3, 1, 0)) - img_np = augment_img(img_np, mode=mode) - img_tensor = torch.from_numpy(np.ascontiguousarray(img_np)) - if len(img_size) == 3: - img_tensor = img_tensor.permute(2, 0, 1) - elif len(img_size) == 4: - img_tensor = img_tensor.permute(3, 2, 0, 1) - - return img_tensor.type_as(img) - - -def augment_img_np3(img, mode=0): - if mode == 0: - return img - elif mode == 1: - return img.transpose(1, 0, 2) - elif mode == 2: - return img[::-1, :, :] - elif mode == 3: - img = img[::-1, :, :] - img = img.transpose(1, 0, 2) - return img - elif mode == 4: - return img[:, ::-1, :] - elif mode == 5: - img = img[:, ::-1, :] - img = img.transpose(1, 0, 2) - return img - elif mode == 6: - img = img[:, ::-1, :] - img = img[::-1, :, :] - return img - elif mode == 7: - img = img[:, ::-1, :] - img = img[::-1, :, :] - img = img.transpose(1, 0, 2) - return img - - -def augment_imgs(img_list, hflip=True, rot=True): - # horizontal flip OR rotate - hflip = hflip and random.random() < 0.5 - vflip = rot and random.random() < 0.5 - rot90 = rot and random.random() < 0.5 - - def _augment(img): - if hflip: - img = img[:, ::-1, :] - if vflip: - img = img[::-1, :, :] - if rot90: - img = img.transpose(1, 0, 2) - return img - - return [_augment(img) for img in img_list] - - -''' -# -------------------------------------------- -# modcrop and shave -# -------------------------------------------- -''' - - -def modcrop(img_in, scale): - # img_in: Numpy, HWC or HW - img = np.copy(img_in) - if img.ndim == 2: - H, W = img.shape - H_r, W_r = H % scale, W % scale - img = img[:H - H_r, :W - W_r] - elif img.ndim == 3: - H, W, C = img.shape - H_r, W_r = H % scale, W % scale - img = img[:H - H_r, :W - W_r, :] - else: - raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim)) - return img - - -def shave(img_in, border=0): - # img_in: Numpy, HWC or HW - img = np.copy(img_in) - h, w = img.shape[:2] - img = img[border:h-border, border:w-border] - return img - - -''' -# -------------------------------------------- -# image processing process on numpy image -# channel_convert(in_c, tar_type, img_list): -# rgb2ycbcr(img, only_y=True): -# bgr2ycbcr(img, only_y=True): -# 
ycbcr2rgb(img): -# -------------------------------------------- -''' - - -def rgb2ycbcr(img, only_y=True): - '''same as matlab rgb2ycbcr - only_y: only return Y channel - Input: - uint8, [0, 255] - float, [0, 1] - ''' - in_img_type = img.dtype - img.astype(np.float32) - if in_img_type != np.uint8: - img *= 255. - # convert - if only_y: - rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0 - else: - rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], - [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128] - if in_img_type == np.uint8: - rlt = rlt.round() - else: - rlt /= 255. - return rlt.astype(in_img_type) - - -def ycbcr2rgb(img): - '''same as matlab ycbcr2rgb - Input: - uint8, [0, 255] - float, [0, 1] - ''' - in_img_type = img.dtype - img.astype(np.float32) - if in_img_type != np.uint8: - img *= 255. - # convert - rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071], - [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836] - if in_img_type == np.uint8: - rlt = rlt.round() - else: - rlt /= 255. - return rlt.astype(in_img_type) - - -def bgr2ycbcr(img, only_y=True): - '''bgr version of rgb2ycbcr - only_y: only return Y channel - Input: - uint8, [0, 255] - float, [0, 1] - ''' - in_img_type = img.dtype - img.astype(np.float32) - if in_img_type != np.uint8: - img *= 255. - # convert - if only_y: - rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0 - else: - rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], - [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128] - if in_img_type == np.uint8: - rlt = rlt.round() - else: - rlt /= 255. - return rlt.astype(in_img_type) - - -def channel_convert(in_c, tar_type, img_list): - # conversion among BGR, gray and y - if in_c == 3 and tar_type == 'gray': # BGR to gray - gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list] - return [np.expand_dims(img, axis=2) for img in gray_list] - elif in_c == 3 and tar_type == 'y': # BGR to y - y_list = [bgr2ycbcr(img, only_y=True) for img in img_list] - return [np.expand_dims(img, axis=2) for img in y_list] - elif in_c == 1 and tar_type == 'RGB': # gray/y to BGR - return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list] - else: - return img_list - - -''' -# -------------------------------------------- -# metric, PSNR and SSIM -# -------------------------------------------- -''' - - -# -------------------------------------------- -# PSNR -# -------------------------------------------- -def calculate_psnr(img1, img2, border=0): - # img1 and img2 have range [0, 255] - #img1 = img1.squeeze() - #img2 = img2.squeeze() - if not img1.shape == img2.shape: - raise ValueError('Input images must have the same dimensions.') - h, w = img1.shape[:2] - img1 = img1[border:h-border, border:w-border] - img2 = img2[border:h-border, border:w-border] - - img1 = img1.astype(np.float64) - img2 = img2.astype(np.float64) - mse = np.mean((img1 - img2)**2) - if mse == 0: - return float('inf') - return 20 * math.log10(255.0 / math.sqrt(mse)) - - -# -------------------------------------------- -# SSIM -# -------------------------------------------- -def calculate_ssim(img1, img2, border=0): - '''calculate SSIM - the same outputs as MATLAB's - img1, img2: [0, 255] - ''' - #img1 = img1.squeeze() - #img2 = img2.squeeze() - if not img1.shape == img2.shape: - raise ValueError('Input images must have the same dimensions.') - h, w = img1.shape[:2] - img1 = img1[border:h-border, border:w-border] - img2 = 
img2[border:h-border, border:w-border] - - if img1.ndim == 2: - return ssim(img1, img2) - elif img1.ndim == 3: - if img1.shape[2] == 3: - ssims = [] - for i in range(3): - ssims.append(ssim(img1[:,:,i], img2[:,:,i])) - return np.array(ssims).mean() - elif img1.shape[2] == 1: - return ssim(np.squeeze(img1), np.squeeze(img2)) - else: - raise ValueError('Wrong input image dimensions.') - - -def ssim(img1, img2): - C1 = (0.01 * 255)**2 - C2 = (0.03 * 255)**2 - - img1 = img1.astype(np.float64) - img2 = img2.astype(np.float64) - kernel = cv2.getGaussianKernel(11, 1.5) - window = np.outer(kernel, kernel.transpose()) - - mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] # valid - mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] - mu1_sq = mu1**2 - mu2_sq = mu2**2 - mu1_mu2 = mu1 * mu2 - sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq - sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq - sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 - - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * - (sigma1_sq + sigma2_sq + C2)) - return ssim_map.mean() - - -''' -# -------------------------------------------- -# matlab's bicubic imresize (numpy and torch) [0, 1] -# -------------------------------------------- -''' - - -# matlab 'imresize' function, now only support 'bicubic' -def cubic(x): - absx = torch.abs(x) - absx2 = absx**2 - absx3 = absx**3 - return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \ - (-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx)) - - -def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing): - if (scale < 1) and (antialiasing): - # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width - kernel_width = kernel_width / scale - - # Output-space coordinates - x = torch.linspace(1, out_length, out_length) - - # Input-space coordinates. Calculate the inverse mapping such that 0.5 - # in output space maps to 0.5 in input space, and 0.5+scale in output - # space maps to 1.5 in input space. - u = x / scale + 0.5 * (1 - 1 / scale) - - # What is the left-most pixel that can be involved in the computation? - left = torch.floor(u - kernel_width / 2) - - # What is the maximum number of pixels that can be involved in the - # computation? Note: it's OK to use an extra pixel here; if the - # corresponding weights are all zero, it will be eliminated at the end - # of this function. - P = math.ceil(kernel_width) + 2 - - # The indices of the input pixels involved in computing the k-th output - # pixel are in row k of the indices matrix. - indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view( - 1, P).expand(out_length, P) - - # The weights used to compute the k-th output pixel are in row k of the - # weights matrix. - distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices - # apply cubic kernel - if (scale < 1) and (antialiasing): - weights = scale * cubic(distance_to_center * scale) - else: - weights = cubic(distance_to_center) - # Normalize the weights matrix so that each row sums to 1. - weights_sum = torch.sum(weights, 1).view(out_length, 1) - weights = weights / weights_sum.expand(out_length, P) - - # If a column in weights is all zero, get rid of it. only consider the first and last column. 
- weights_zero_tmp = torch.sum((weights == 0), 0) - if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6): - indices = indices.narrow(1, 1, P - 2) - weights = weights.narrow(1, 1, P - 2) - if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6): - indices = indices.narrow(1, 0, P - 2) - weights = weights.narrow(1, 0, P - 2) - weights = weights.contiguous() - indices = indices.contiguous() - sym_len_s = -indices.min() + 1 - sym_len_e = indices.max() - in_length - indices = indices + sym_len_s - 1 - return weights, indices, int(sym_len_s), int(sym_len_e) - - -# -------------------------------------------- -# imresize for tensor image [0, 1] -# -------------------------------------------- -def imresize(img, scale, antialiasing=True): - # Now the scale should be the same for H and W - # input: img: pytorch tensor, CHW or HW [0,1] - # output: CHW or HW [0,1] w/o round - need_squeeze = True if img.dim() == 2 else False - if need_squeeze: - img.unsqueeze_(0) - in_C, in_H, in_W = img.size() - out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale) - kernel_width = 4 - kernel = 'cubic' - - # Return the desired dimension order for performing the resize. The - # strategy is to perform the resize first along the dimension with the - # smallest scale factor. - # Now we do not support this. - - # get weights and indices - weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( - in_H, out_H, scale, kernel, kernel_width, antialiasing) - weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( - in_W, out_W, scale, kernel, kernel_width, antialiasing) - # process H dimension - # symmetric copying - img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W) - img_aug.narrow(1, sym_len_Hs, in_H).copy_(img) - - sym_patch = img[:, :sym_len_Hs, :] - inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(1, inv_idx) - img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv) - - sym_patch = img[:, -sym_len_He:, :] - inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(1, inv_idx) - img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv) - - out_1 = torch.FloatTensor(in_C, out_H, in_W) - kernel_width = weights_H.size(1) - for i in range(out_H): - idx = int(indices_H[i][0]) - for j in range(out_C): - out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i]) - - # process W dimension - # symmetric copying - out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We) - out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1) - - sym_patch = out_1[:, :, :sym_len_Ws] - inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(2, inv_idx) - out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv) - - sym_patch = out_1[:, :, -sym_len_We:] - inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(2, inv_idx) - out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv) - - out_2 = torch.FloatTensor(in_C, out_H, out_W) - kernel_width = weights_W.size(1) - for i in range(out_W): - idx = int(indices_W[i][0]) - for j in range(out_C): - out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i]) - if need_squeeze: - out_2.squeeze_() - return out_2 - - -# -------------------------------------------- -# imresize for numpy image [0, 1] -# -------------------------------------------- -def 
imresize_np(img, scale, antialiasing=True): - # Now the scale should be the same for H and W - # input: img: Numpy, HWC or HW [0,1] - # output: HWC or HW [0,1] w/o round - img = torch.from_numpy(img) - need_squeeze = True if img.dim() == 2 else False - if need_squeeze: - img.unsqueeze_(2) - - in_H, in_W, in_C = img.size() - out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale) - kernel_width = 4 - kernel = 'cubic' - - # Return the desired dimension order for performing the resize. The - # strategy is to perform the resize first along the dimension with the - # smallest scale factor. - # Now we do not support this. - - # get weights and indices - weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( - in_H, out_H, scale, kernel, kernel_width, antialiasing) - weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( - in_W, out_W, scale, kernel, kernel_width, antialiasing) - # process H dimension - # symmetric copying - img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C) - img_aug.narrow(0, sym_len_Hs, in_H).copy_(img) - - sym_patch = img[:sym_len_Hs, :, :] - inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(0, inv_idx) - img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv) - - sym_patch = img[-sym_len_He:, :, :] - inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(0, inv_idx) - img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv) - - out_1 = torch.FloatTensor(out_H, in_W, in_C) - kernel_width = weights_H.size(1) - for i in range(out_H): - idx = int(indices_H[i][0]) - for j in range(out_C): - out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i]) - - # process W dimension - # symmetric copying - out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C) - out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1) - - sym_patch = out_1[:, :sym_len_Ws, :] - inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(1, inv_idx) - out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv) - - sym_patch = out_1[:, -sym_len_We:, :] - inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() - sym_patch_inv = sym_patch.index_select(1, inv_idx) - out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv) - - out_2 = torch.FloatTensor(out_H, out_W, in_C) - kernel_width = weights_W.size(1) - for i in range(out_W): - idx = int(indices_W[i][0]) - for j in range(out_C): - out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i]) - if need_squeeze: - out_2.squeeze_() - - return out_2.numpy() - - -if __name__ == '__main__': - print('---') -# img = imread_uint('test.bmp', 3) -# img = uint2single(img) -# img_bicubic = imresize_np(img, 1/4) \ No newline at end of file diff --git a/ControlNet/ldm/modules/midas/__init__.py b/ControlNet/ldm/modules/midas/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/modules/midas/api.py b/ControlNet/ldm/modules/midas/api.py deleted file mode 100644 index b58ebbffd942a2fc22264f0ab47e400c26b9f41c..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/api.py +++ /dev/null @@ -1,170 +0,0 @@ -# based on https://github.com/isl-org/MiDaS - -import cv2 -import torch -import torch.nn as nn -from torchvision.transforms import Compose - -from 
ldm.modules.midas.midas.dpt_depth import DPTDepthModel -from ldm.modules.midas.midas.midas_net import MidasNet -from ldm.modules.midas.midas.midas_net_custom import MidasNet_small -from ldm.modules.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet - - -ISL_PATHS = { - "dpt_large": "midas_models/dpt_large-midas-2f21e586.pt", - "dpt_hybrid": "midas_models/dpt_hybrid-midas-501f0c75.pt", - "midas_v21": "", - "midas_v21_small": "", -} - - -def disabled_train(self, mode=True): - """Overwrite model.train with this function to make sure train/eval mode - does not change anymore.""" - return self - - -def load_midas_transform(model_type): - # https://github.com/isl-org/MiDaS/blob/master/run.py - # load transform only - if model_type == "dpt_large": # DPT-Large - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "dpt_hybrid": # DPT-Hybrid - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "midas_v21": - net_w, net_h = 384, 384 - resize_mode = "upper_bound" - normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - - elif model_type == "midas_v21_small": - net_w, net_h = 256, 256 - resize_mode = "upper_bound" - normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - - else: - assert False, f"model_type '{model_type}' not implemented, use: --model_type large" - - transform = Compose( - [ - Resize( - net_w, - net_h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method=resize_mode, - image_interpolation_method=cv2.INTER_CUBIC, - ), - normalization, - PrepareForNet(), - ] - ) - - return transform - - -def load_model(model_type): - # https://github.com/isl-org/MiDaS/blob/master/run.py - # load network - model_path = ISL_PATHS[model_type] - if model_type == "dpt_large": # DPT-Large - model = DPTDepthModel( - path=model_path, - backbone="vitl16_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "dpt_hybrid": # DPT-Hybrid - model = DPTDepthModel( - path=model_path, - backbone="vitb_rn50_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - elif model_type == "midas_v21": - model = MidasNet(model_path, non_negative=True) - net_w, net_h = 384, 384 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - elif model_type == "midas_v21_small": - model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, - non_negative=True, blocks={'expand': True}) - net_w, net_h = 256, 256 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - else: - print(f"model_type '{model_type}' not implemented, use: --model_type large") - assert False - - transform = Compose( - [ - Resize( - net_w, - net_h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method=resize_mode, - image_interpolation_method=cv2.INTER_CUBIC, - ), - normalization, - PrepareForNet(), - ] - ) - - return model.eval(), transform - - -class MiDaSInference(nn.Module): - MODEL_TYPES_TORCH_HUB = [ - "DPT_Large", - 
"DPT_Hybrid", - "MiDaS_small" - ] - MODEL_TYPES_ISL = [ - "dpt_large", - "dpt_hybrid", - "midas_v21", - "midas_v21_small", - ] - - def __init__(self, model_type): - super().__init__() - assert (model_type in self.MODEL_TYPES_ISL) - model, _ = load_model(model_type) - self.model = model - self.model.train = disabled_train - - def forward(self, x): - # x in 0..1 as produced by calling self.transform on a 0..1 float64 numpy array - # NOTE: we expect that the correct transform has been called during dataloading. - with torch.no_grad(): - prediction = self.model(x) - prediction = torch.nn.functional.interpolate( - prediction.unsqueeze(1), - size=x.shape[2:], - mode="bicubic", - align_corners=False, - ) - assert prediction.shape == (x.shape[0], 1, x.shape[2], x.shape[3]) - return prediction - diff --git a/ControlNet/ldm/modules/midas/midas/__init__.py b/ControlNet/ldm/modules/midas/midas/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ControlNet/ldm/modules/midas/midas/base_model.py b/ControlNet/ldm/modules/midas/midas/base_model.py deleted file mode 100644 index 5cf430239b47ec5ec07531263f26f5c24a2311cd..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/base_model.py +++ /dev/null @@ -1,16 +0,0 @@ -import torch - - -class BaseModel(torch.nn.Module): - def load(self, path): - """Load model from file. - - Args: - path (str): file path - """ - parameters = torch.load(path, map_location=torch.device('cpu')) - - if "optimizer" in parameters: - parameters = parameters["model"] - - self.load_state_dict(parameters) diff --git a/ControlNet/ldm/modules/midas/midas/blocks.py b/ControlNet/ldm/modules/midas/midas/blocks.py deleted file mode 100644 index 2145d18fa98060a618536d9a64fe6589e9be4f78..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/blocks.py +++ /dev/null @@ -1,342 +0,0 @@ -import torch -import torch.nn as nn - -from .vit import ( - _make_pretrained_vitb_rn50_384, - _make_pretrained_vitl16_384, - _make_pretrained_vitb16_384, - forward_vit, -) - -def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",): - if backbone == "vitl16_384": - pretrained = _make_pretrained_vitl16_384( - use_pretrained, hooks=hooks, use_readout=use_readout - ) - scratch = _make_scratch( - [256, 512, 1024, 1024], features, groups=groups, expand=expand - ) # ViT-L/16 - 85.0% Top1 (backbone) - elif backbone == "vitb_rn50_384": - pretrained = _make_pretrained_vitb_rn50_384( - use_pretrained, - hooks=hooks, - use_vit_only=use_vit_only, - use_readout=use_readout, - ) - scratch = _make_scratch( - [256, 512, 768, 768], features, groups=groups, expand=expand - ) # ViT-H/16 - 85.0% Top1 (backbone) - elif backbone == "vitb16_384": - pretrained = _make_pretrained_vitb16_384( - use_pretrained, hooks=hooks, use_readout=use_readout - ) - scratch = _make_scratch( - [96, 192, 384, 768], features, groups=groups, expand=expand - ) # ViT-B/16 - 84.6% Top1 (backbone) - elif backbone == "resnext101_wsl": - pretrained = _make_pretrained_resnext101_wsl(use_pretrained) - scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 - elif backbone == "efficientnet_lite3": - pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) - scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # 
efficientnet_lite3 - else: - print(f"Backbone '{backbone}' not implemented") - assert False - - return pretrained, scratch - - -def _make_scratch(in_shape, out_shape, groups=1, expand=False): - scratch = nn.Module() - - out_shape1 = out_shape - out_shape2 = out_shape - out_shape3 = out_shape - out_shape4 = out_shape - if expand==True: - out_shape1 = out_shape - out_shape2 = out_shape*2 - out_shape3 = out_shape*4 - out_shape4 = out_shape*8 - - scratch.layer1_rn = nn.Conv2d( - in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer2_rn = nn.Conv2d( - in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer3_rn = nn.Conv2d( - in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - scratch.layer4_rn = nn.Conv2d( - in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups - ) - - return scratch - - -def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): - efficientnet = torch.hub.load( - "rwightman/gen-efficientnet-pytorch", - "tf_efficientnet_lite3", - pretrained=use_pretrained, - exportable=exportable - ) - return _make_efficientnet_backbone(efficientnet) - - -def _make_efficientnet_backbone(effnet): - pretrained = nn.Module() - - pretrained.layer1 = nn.Sequential( - effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] - ) - pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) - pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) - pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) - - return pretrained - - -def _make_resnet_backbone(resnet): - pretrained = nn.Module() - pretrained.layer1 = nn.Sequential( - resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 - ) - - pretrained.layer2 = resnet.layer2 - pretrained.layer3 = resnet.layer3 - pretrained.layer4 = resnet.layer4 - - return pretrained - - -def _make_pretrained_resnext101_wsl(use_pretrained): - resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") - return _make_resnet_backbone(resnet) - - - -class Interpolate(nn.Module): - """Interpolation module. - """ - - def __init__(self, scale_factor, mode, align_corners=False): - """Init. - - Args: - scale_factor (float): scaling - mode (str): interpolation mode - """ - super(Interpolate, self).__init__() - - self.interp = nn.functional.interpolate - self.scale_factor = scale_factor - self.mode = mode - self.align_corners = align_corners - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input - - Returns: - tensor: interpolated data - """ - - x = self.interp( - x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners - ) - - return x - - -class ResidualConvUnit(nn.Module): - """Residual convolution module. - """ - - def __init__(self, features): - """Init. - - Args: - features (int): number of features - """ - super().__init__() - - self.conv1 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True - ) - - self.conv2 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True - ) - - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input - - Returns: - tensor: output - """ - out = self.relu(x) - out = self.conv1(out) - out = self.relu(out) - out = self.conv2(out) - - return out + x - - -class FeatureFusionBlock(nn.Module): - """Feature fusion block. - """ - - def __init__(self, features): - """Init. 
- - Args: - features (int): number of features - """ - super(FeatureFusionBlock, self).__init__() - - self.resConfUnit1 = ResidualConvUnit(features) - self.resConfUnit2 = ResidualConvUnit(features) - - def forward(self, *xs): - """Forward pass. - - Returns: - tensor: output - """ - output = xs[0] - - if len(xs) == 2: - output += self.resConfUnit1(xs[1]) - - output = self.resConfUnit2(output) - - output = nn.functional.interpolate( - output, scale_factor=2, mode="bilinear", align_corners=True - ) - - return output - - - - -class ResidualConvUnit_custom(nn.Module): - """Residual convolution module. - """ - - def __init__(self, features, activation, bn): - """Init. - - Args: - features (int): number of features - """ - super().__init__() - - self.bn = bn - - self.groups=1 - - self.conv1 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups - ) - - self.conv2 = nn.Conv2d( - features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups - ) - - if self.bn==True: - self.bn1 = nn.BatchNorm2d(features) - self.bn2 = nn.BatchNorm2d(features) - - self.activation = activation - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input - - Returns: - tensor: output - """ - - out = self.activation(x) - out = self.conv1(out) - if self.bn==True: - out = self.bn1(out) - - out = self.activation(out) - out = self.conv2(out) - if self.bn==True: - out = self.bn2(out) - - if self.groups > 1: - out = self.conv_merge(out) - - return self.skip_add.add(out, x) - - # return out + x - - -class FeatureFusionBlock_custom(nn.Module): - """Feature fusion block. - """ - - def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True): - """Init. - - Args: - features (int): number of features - """ - super(FeatureFusionBlock_custom, self).__init__() - - self.deconv = deconv - self.align_corners = align_corners - - self.groups=1 - - self.expand = expand - out_features = features - if self.expand==True: - out_features = features//2 - - self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) - - self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) - self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) - - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, *xs): - """Forward pass. 
- - Returns: - tensor: output - """ - output = xs[0] - - if len(xs) == 2: - res = self.resConfUnit1(xs[1]) - output = self.skip_add.add(output, res) - # output += res - - output = self.resConfUnit2(output) - - output = nn.functional.interpolate( - output, scale_factor=2, mode="bilinear", align_corners=self.align_corners - ) - - output = self.out_conv(output) - - return output - diff --git a/ControlNet/ldm/modules/midas/midas/dpt_depth.py b/ControlNet/ldm/modules/midas/midas/dpt_depth.py deleted file mode 100644 index 4e9aab5d2767dffea39da5b3f30e2798688216f1..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/dpt_depth.py +++ /dev/null @@ -1,109 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .base_model import BaseModel -from .blocks import ( - FeatureFusionBlock, - FeatureFusionBlock_custom, - Interpolate, - _make_encoder, - forward_vit, -) - - -def _make_fusion_block(features, use_bn): - return FeatureFusionBlock_custom( - features, - nn.ReLU(False), - deconv=False, - bn=use_bn, - expand=False, - align_corners=True, - ) - - -class DPT(BaseModel): - def __init__( - self, - head, - features=256, - backbone="vitb_rn50_384", - readout="project", - channels_last=False, - use_bn=False, - ): - - super(DPT, self).__init__() - - self.channels_last = channels_last - - hooks = { - "vitb_rn50_384": [0, 1, 8, 11], - "vitb16_384": [2, 5, 8, 11], - "vitl16_384": [5, 11, 17, 23], - } - - # Instantiate backbone and reassemble blocks - self.pretrained, self.scratch = _make_encoder( - backbone, - features, - False, # Set to true of you want to train from scratch, uses ImageNet weights - groups=1, - expand=False, - exportable=False, - hooks=hooks[backbone], - use_readout=readout, - ) - - self.scratch.refinenet1 = _make_fusion_block(features, use_bn) - self.scratch.refinenet2 = _make_fusion_block(features, use_bn) - self.scratch.refinenet3 = _make_fusion_block(features, use_bn) - self.scratch.refinenet4 = _make_fusion_block(features, use_bn) - - self.scratch.output_conv = head - - - def forward(self, x): - if self.channels_last == True: - x.contiguous(memory_format=torch.channels_last) - - layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return out - - -class DPTDepthModel(DPT): - def __init__(self, path=None, non_negative=True, **kwargs): - features = kwargs["features"] if "features" in kwargs else 256 - - head = nn.Sequential( - nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), - Interpolate(scale_factor=2, mode="bilinear", align_corners=True), - nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - nn.Identity(), - ) - - super().__init__(head, **kwargs) - - if path is not None: - self.load(path) - - def forward(self, x): - return super().forward(x).squeeze(dim=1) - diff --git a/ControlNet/ldm/modules/midas/midas/midas_net.py b/ControlNet/ldm/modules/midas/midas/midas_net.py deleted file mode 100644 index 
8a954977800b0a0f48807e80fa63041910e33c1f..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/midas_net.py +++ /dev/null @@ -1,76 +0,0 @@ -"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. -This file contains code that is adapted from -https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py -""" -import torch -import torch.nn as nn - -from .base_model import BaseModel -from .blocks import FeatureFusionBlock, Interpolate, _make_encoder - - -class MidasNet(BaseModel): - """Network for monocular depth estimation. - """ - - def __init__(self, path=None, features=256, non_negative=True): - """Init. - - Args: - path (str, optional): Path to saved model. Defaults to None. - features (int, optional): Number of features. Defaults to 256. - backbone (str, optional): Backbone network for encoder. Defaults to resnet50 - """ - print("Loading weights: ", path) - - super(MidasNet, self).__init__() - - use_pretrained = False if path is None else True - - self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) - - self.scratch.refinenet4 = FeatureFusionBlock(features) - self.scratch.refinenet3 = FeatureFusionBlock(features) - self.scratch.refinenet2 = FeatureFusionBlock(features) - self.scratch.refinenet1 = FeatureFusionBlock(features) - - self.scratch.output_conv = nn.Sequential( - nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), - Interpolate(scale_factor=2, mode="bilinear"), - nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - ) - - if path: - self.load(path) - - def forward(self, x): - """Forward pass. - - Args: - x (tensor): input data (image) - - Returns: - tensor: depth - """ - - layer_1 = self.pretrained.layer1(x) - layer_2 = self.pretrained.layer2(layer_1) - layer_3 = self.pretrained.layer3(layer_2) - layer_4 = self.pretrained.layer4(layer_3) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return torch.squeeze(out, dim=1) diff --git a/ControlNet/ldm/modules/midas/midas/midas_net_custom.py b/ControlNet/ldm/modules/midas/midas/midas_net_custom.py deleted file mode 100644 index 50e4acb5e53d5fabefe3dde16ab49c33c2b7797c..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/midas_net_custom.py +++ /dev/null @@ -1,128 +0,0 @@ -"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. -This file contains code that is adapted from -https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py -""" -import torch -import torch.nn as nn - -from .base_model import BaseModel -from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder - - -class MidasNet_small(BaseModel): - """Network for monocular depth estimation. 
- """ - - def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, - blocks={'expand': True}): - """Init. - - Args: - path (str, optional): Path to saved model. Defaults to None. - features (int, optional): Number of features. Defaults to 256. - backbone (str, optional): Backbone network for encoder. Defaults to resnet50 - """ - print("Loading weights: ", path) - - super(MidasNet_small, self).__init__() - - use_pretrained = False if path else True - - self.channels_last = channels_last - self.blocks = blocks - self.backbone = backbone - - self.groups = 1 - - features1=features - features2=features - features3=features - features4=features - self.expand = False - if "expand" in self.blocks and self.blocks['expand'] == True: - self.expand = True - features1=features - features2=features*2 - features3=features*4 - features4=features*8 - - self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) - - self.scratch.activation = nn.ReLU(False) - - self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) - self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) - - - self.scratch.output_conv = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), - Interpolate(scale_factor=2, mode="bilinear"), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), - self.scratch.activation, - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(True) if non_negative else nn.Identity(), - nn.Identity(), - ) - - if path: - self.load(path) - - - def forward(self, x): - """Forward pass. 
- - Args: - x (tensor): input data (image) - - Returns: - tensor: depth - """ - if self.channels_last==True: - print("self.channels_last = ", self.channels_last) - x.contiguous(memory_format=torch.channels_last) - - - layer_1 = self.pretrained.layer1(x) - layer_2 = self.pretrained.layer2(layer_1) - layer_3 = self.pretrained.layer3(layer_2) - layer_4 = self.pretrained.layer4(layer_3) - - layer_1_rn = self.scratch.layer1_rn(layer_1) - layer_2_rn = self.scratch.layer2_rn(layer_2) - layer_3_rn = self.scratch.layer3_rn(layer_3) - layer_4_rn = self.scratch.layer4_rn(layer_4) - - - path_4 = self.scratch.refinenet4(layer_4_rn) - path_3 = self.scratch.refinenet3(path_4, layer_3_rn) - path_2 = self.scratch.refinenet2(path_3, layer_2_rn) - path_1 = self.scratch.refinenet1(path_2, layer_1_rn) - - out = self.scratch.output_conv(path_1) - - return torch.squeeze(out, dim=1) - - - -def fuse_model(m): - prev_previous_type = nn.Identity() - prev_previous_name = '' - previous_type = nn.Identity() - previous_name = '' - for name, module in m.named_modules(): - if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: - # print("FUSED ", prev_previous_name, previous_name, name) - torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) - elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: - # print("FUSED ", prev_previous_name, previous_name) - torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) - # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: - # print("FUSED ", previous_name, name) - # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) - - prev_previous_type = previous_type - prev_previous_name = previous_name - previous_type = type(module) - previous_name = name \ No newline at end of file diff --git a/ControlNet/ldm/modules/midas/midas/transforms.py b/ControlNet/ldm/modules/midas/midas/transforms.py deleted file mode 100644 index 350cbc11662633ad7f8968eb10be2e7de6e384e9..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/transforms.py +++ /dev/null @@ -1,234 +0,0 @@ -import numpy as np -import cv2 -import math - - -def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): - """Rezise the sample to ensure the given size. Keeps aspect ratio. - - Args: - sample (dict): sample - size (tuple): image size - - Returns: - tuple: new size - """ - shape = list(sample["disparity"].shape) - - if shape[0] >= size[0] and shape[1] >= size[1]: - return sample - - scale = [0, 0] - scale[0] = size[0] / shape[0] - scale[1] = size[1] / shape[1] - - scale = max(scale) - - shape[0] = math.ceil(scale * shape[0]) - shape[1] = math.ceil(scale * shape[1]) - - # resize - sample["image"] = cv2.resize( - sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method - ) - - sample["disparity"] = cv2.resize( - sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST - ) - sample["mask"] = cv2.resize( - sample["mask"].astype(np.float32), - tuple(shape[::-1]), - interpolation=cv2.INTER_NEAREST, - ) - sample["mask"] = sample["mask"].astype(bool) - - return tuple(shape) - - -class Resize(object): - """Resize sample to given size (width, height). - """ - - def __init__( - self, - width, - height, - resize_target=True, - keep_aspect_ratio=False, - ensure_multiple_of=1, - resize_method="lower_bound", - image_interpolation_method=cv2.INTER_AREA, - ): - """Init. 
- - Args: - width (int): desired output width - height (int): desired output height - resize_target (bool, optional): - True: Resize the full sample (image, mask, target). - False: Resize image only. - Defaults to True. - keep_aspect_ratio (bool, optional): - True: Keep the aspect ratio of the input sample. - Output sample might not have the given width and height, and - resize behaviour depends on the parameter 'resize_method'. - Defaults to False. - ensure_multiple_of (int, optional): - Output width and height is constrained to be multiple of this parameter. - Defaults to 1. - resize_method (str, optional): - "lower_bound": Output will be at least as large as the given size. - "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) - "minimal": Scale as least as possible. (Output size might be smaller than given size.) - Defaults to "lower_bound". - """ - self.__width = width - self.__height = height - - self.__resize_target = resize_target - self.__keep_aspect_ratio = keep_aspect_ratio - self.__multiple_of = ensure_multiple_of - self.__resize_method = resize_method - self.__image_interpolation_method = image_interpolation_method - - def constrain_to_multiple_of(self, x, min_val=0, max_val=None): - y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) - - if max_val is not None and y > max_val: - y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) - - if y < min_val: - y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) - - return y - - def get_size(self, width, height): - # determine new height and width - scale_height = self.__height / height - scale_width = self.__width / width - - if self.__keep_aspect_ratio: - if self.__resize_method == "lower_bound": - # scale such that output size is lower bound - if scale_width > scale_height: - # fit width - scale_height = scale_width - else: - # fit height - scale_width = scale_height - elif self.__resize_method == "upper_bound": - # scale such that output size is upper bound - if scale_width < scale_height: - # fit width - scale_height = scale_width - else: - # fit height - scale_width = scale_height - elif self.__resize_method == "minimal": - # scale as least as possbile - if abs(1 - scale_width) < abs(1 - scale_height): - # fit width - scale_height = scale_width - else: - # fit height - scale_width = scale_height - else: - raise ValueError( - f"resize_method {self.__resize_method} not implemented" - ) - - if self.__resize_method == "lower_bound": - new_height = self.constrain_to_multiple_of( - scale_height * height, min_val=self.__height - ) - new_width = self.constrain_to_multiple_of( - scale_width * width, min_val=self.__width - ) - elif self.__resize_method == "upper_bound": - new_height = self.constrain_to_multiple_of( - scale_height * height, max_val=self.__height - ) - new_width = self.constrain_to_multiple_of( - scale_width * width, max_val=self.__width - ) - elif self.__resize_method == "minimal": - new_height = self.constrain_to_multiple_of(scale_height * height) - new_width = self.constrain_to_multiple_of(scale_width * width) - else: - raise ValueError(f"resize_method {self.__resize_method} not implemented") - - return (new_width, new_height) - - def __call__(self, sample): - width, height = self.get_size( - sample["image"].shape[1], sample["image"].shape[0] - ) - - # resize sample - sample["image"] = cv2.resize( - sample["image"], - (width, height), - interpolation=self.__image_interpolation_method, - ) - - if 
self.__resize_target: - if "disparity" in sample: - sample["disparity"] = cv2.resize( - sample["disparity"], - (width, height), - interpolation=cv2.INTER_NEAREST, - ) - - if "depth" in sample: - sample["depth"] = cv2.resize( - sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST - ) - - sample["mask"] = cv2.resize( - sample["mask"].astype(np.float32), - (width, height), - interpolation=cv2.INTER_NEAREST, - ) - sample["mask"] = sample["mask"].astype(bool) - - return sample - - -class NormalizeImage(object): - """Normlize image by given mean and std. - """ - - def __init__(self, mean, std): - self.__mean = mean - self.__std = std - - def __call__(self, sample): - sample["image"] = (sample["image"] - self.__mean) / self.__std - - return sample - - -class PrepareForNet(object): - """Prepare sample for usage as network input. - """ - - def __init__(self): - pass - - def __call__(self, sample): - image = np.transpose(sample["image"], (2, 0, 1)) - sample["image"] = np.ascontiguousarray(image).astype(np.float32) - - if "mask" in sample: - sample["mask"] = sample["mask"].astype(np.float32) - sample["mask"] = np.ascontiguousarray(sample["mask"]) - - if "disparity" in sample: - disparity = sample["disparity"].astype(np.float32) - sample["disparity"] = np.ascontiguousarray(disparity) - - if "depth" in sample: - depth = sample["depth"].astype(np.float32) - sample["depth"] = np.ascontiguousarray(depth) - - return sample diff --git a/ControlNet/ldm/modules/midas/midas/vit.py b/ControlNet/ldm/modules/midas/midas/vit.py deleted file mode 100644 index ea46b1be88b261b0dec04f3da0256f5f66f88a74..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/midas/vit.py +++ /dev/null @@ -1,491 +0,0 @@ -import torch -import torch.nn as nn -import timm -import types -import math -import torch.nn.functional as F - - -class Slice(nn.Module): - def __init__(self, start_index=1): - super(Slice, self).__init__() - self.start_index = start_index - - def forward(self, x): - return x[:, self.start_index :] - - -class AddReadout(nn.Module): - def __init__(self, start_index=1): - super(AddReadout, self).__init__() - self.start_index = start_index - - def forward(self, x): - if self.start_index == 2: - readout = (x[:, 0] + x[:, 1]) / 2 - else: - readout = x[:, 0] - return x[:, self.start_index :] + readout.unsqueeze(1) - - -class ProjectReadout(nn.Module): - def __init__(self, in_features, start_index=1): - super(ProjectReadout, self).__init__() - self.start_index = start_index - - self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) - - def forward(self, x): - readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :]) - features = torch.cat((x[:, self.start_index :], readout), -1) - - return self.project(features) - - -class Transpose(nn.Module): - def __init__(self, dim0, dim1): - super(Transpose, self).__init__() - self.dim0 = dim0 - self.dim1 = dim1 - - def forward(self, x): - x = x.transpose(self.dim0, self.dim1) - return x - - -def forward_vit(pretrained, x): - b, c, h, w = x.shape - - glob = pretrained.model.forward_flex(x) - - layer_1 = pretrained.activations["1"] - layer_2 = pretrained.activations["2"] - layer_3 = pretrained.activations["3"] - layer_4 = pretrained.activations["4"] - - layer_1 = pretrained.act_postprocess1[0:2](layer_1) - layer_2 = pretrained.act_postprocess2[0:2](layer_2) - layer_3 = pretrained.act_postprocess3[0:2](layer_3) - layer_4 = pretrained.act_postprocess4[0:2](layer_4) - - unflatten = nn.Sequential( - nn.Unflatten( - 2, - 
torch.Size( - [ - h // pretrained.model.patch_size[1], - w // pretrained.model.patch_size[0], - ] - ), - ) - ) - - if layer_1.ndim == 3: - layer_1 = unflatten(layer_1) - if layer_2.ndim == 3: - layer_2 = unflatten(layer_2) - if layer_3.ndim == 3: - layer_3 = unflatten(layer_3) - if layer_4.ndim == 3: - layer_4 = unflatten(layer_4) - - layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) - layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) - layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) - layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) - - return layer_1, layer_2, layer_3, layer_4 - - -def _resize_pos_embed(self, posemb, gs_h, gs_w): - posemb_tok, posemb_grid = ( - posemb[:, : self.start_index], - posemb[0, self.start_index :], - ) - - gs_old = int(math.sqrt(len(posemb_grid))) - - posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) - posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") - posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) - - posemb = torch.cat([posemb_tok, posemb_grid], dim=1) - - return posemb - - -def forward_flex(self, x): - b, c, h, w = x.shape - - pos_embed = self._resize_pos_embed( - self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] - ) - - B = x.shape[0] - - if hasattr(self.patch_embed, "backbone"): - x = self.patch_embed.backbone(x) - if isinstance(x, (list, tuple)): - x = x[-1] # last feature if backbone outputs list/tuple of features - - x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) - - if getattr(self, "dist_token", None) is not None: - cls_tokens = self.cls_token.expand( - B, -1, -1 - ) # stole cls_tokens impl from Phil Wang, thanks - dist_token = self.dist_token.expand(B, -1, -1) - x = torch.cat((cls_tokens, dist_token, x), dim=1) - else: - cls_tokens = self.cls_token.expand( - B, -1, -1 - ) # stole cls_tokens impl from Phil Wang, thanks - x = torch.cat((cls_tokens, x), dim=1) - - x = x + pos_embed - x = self.pos_drop(x) - - for blk in self.blocks: - x = blk(x) - - x = self.norm(x) - - return x - - -activations = {} - - -def get_activation(name): - def hook(model, input, output): - activations[name] = output - - return hook - - -def get_readout_oper(vit_features, features, use_readout, start_index=1): - if use_readout == "ignore": - readout_oper = [Slice(start_index)] * len(features) - elif use_readout == "add": - readout_oper = [AddReadout(start_index)] * len(features) - elif use_readout == "project": - readout_oper = [ - ProjectReadout(vit_features, start_index) for out_feat in features - ] - else: - assert ( - False - ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" - - return readout_oper - - -def _make_vit_b16_backbone( - model, - features=[96, 192, 384, 768], - size=[384, 384], - hooks=[2, 5, 8, 11], - vit_features=768, - use_readout="ignore", - start_index=1, -): - pretrained = nn.Module() - - pretrained.model = model - pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) - pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) - pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) - pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) - - pretrained.activations = activations - - readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) - - # 32, 48, 136, 384 - 
pretrained.act_postprocess1 = nn.Sequential( - readout_oper[0], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[0], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[0], - out_channels=features[0], - kernel_size=4, - stride=4, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess2 = nn.Sequential( - readout_oper[1], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[1], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[1], - out_channels=features[1], - kernel_size=2, - stride=2, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess3 = nn.Sequential( - readout_oper[2], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[2], - kernel_size=1, - stride=1, - padding=0, - ), - ) - - pretrained.act_postprocess4 = nn.Sequential( - readout_oper[3], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[3], - kernel_size=1, - stride=1, - padding=0, - ), - nn.Conv2d( - in_channels=features[3], - out_channels=features[3], - kernel_size=3, - stride=2, - padding=1, - ), - ) - - pretrained.model.start_index = start_index - pretrained.model.patch_size = [16, 16] - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. - pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) - pretrained.model._resize_pos_embed = types.MethodType( - _resize_pos_embed, pretrained.model - ) - - return pretrained - - -def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) - - hooks = [5, 11, 17, 23] if hooks == None else hooks - return _make_vit_b16_backbone( - model, - features=[256, 512, 1024, 1024], - hooks=hooks, - vit_features=1024, - use_readout=use_readout, - ) - - -def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout - ) - - -def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout - ) - - -def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None): - model = timm.create_model( - "vit_deit_base_distilled_patch16_384", pretrained=pretrained - ) - - hooks = [2, 5, 8, 11] if hooks == None else hooks - return _make_vit_b16_backbone( - model, - features=[96, 192, 384, 768], - hooks=hooks, - use_readout=use_readout, - start_index=2, - ) - - -def _make_vit_b_rn50_backbone( - model, - features=[256, 512, 768, 768], - size=[384, 384], - hooks=[0, 1, 8, 11], - vit_features=768, - use_vit_only=False, - 
use_readout="ignore", - start_index=1, -): - pretrained = nn.Module() - - pretrained.model = model - - if use_vit_only == True: - pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) - pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) - else: - pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( - get_activation("1") - ) - pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( - get_activation("2") - ) - - pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) - pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) - - pretrained.activations = activations - - readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) - - if use_vit_only == True: - pretrained.act_postprocess1 = nn.Sequential( - readout_oper[0], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[0], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[0], - out_channels=features[0], - kernel_size=4, - stride=4, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - - pretrained.act_postprocess2 = nn.Sequential( - readout_oper[1], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[1], - kernel_size=1, - stride=1, - padding=0, - ), - nn.ConvTranspose2d( - in_channels=features[1], - out_channels=features[1], - kernel_size=2, - stride=2, - padding=0, - bias=True, - dilation=1, - groups=1, - ), - ) - else: - pretrained.act_postprocess1 = nn.Sequential( - nn.Identity(), nn.Identity(), nn.Identity() - ) - pretrained.act_postprocess2 = nn.Sequential( - nn.Identity(), nn.Identity(), nn.Identity() - ) - - pretrained.act_postprocess3 = nn.Sequential( - readout_oper[2], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[2], - kernel_size=1, - stride=1, - padding=0, - ), - ) - - pretrained.act_postprocess4 = nn.Sequential( - readout_oper[3], - Transpose(1, 2), - nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), - nn.Conv2d( - in_channels=vit_features, - out_channels=features[3], - kernel_size=1, - stride=1, - padding=0, - ), - nn.Conv2d( - in_channels=features[3], - out_channels=features[3], - kernel_size=3, - stride=2, - padding=1, - ), - ) - - pretrained.model.start_index = start_index - pretrained.model.patch_size = [16, 16] - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. - pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) - - # We inject this function into the VisionTransformer instances so that - # we can use it with interpolated position embeddings without modifying the library source. 
- pretrained.model._resize_pos_embed = types.MethodType( - _resize_pos_embed, pretrained.model - ) - - return pretrained - - -def _make_pretrained_vitb_rn50_384( - pretrained, use_readout="ignore", hooks=None, use_vit_only=False -): - model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) - - hooks = [0, 1, 8, 11] if hooks == None else hooks - return _make_vit_b_rn50_backbone( - model, - features=[256, 512, 768, 768], - size=[384, 384], - hooks=hooks, - use_vit_only=use_vit_only, - use_readout=use_readout, - ) diff --git a/ControlNet/ldm/modules/midas/utils.py b/ControlNet/ldm/modules/midas/utils.py deleted file mode 100644 index 9a9d3b5b66370fa98da9e067ba53ead848ea9a59..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/modules/midas/utils.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Utils for monoDepth.""" -import sys -import re -import numpy as np -import cv2 -import torch - - -def read_pfm(path): - """Read pfm file. - - Args: - path (str): path to file - - Returns: - tuple: (data, scale) - """ - with open(path, "rb") as file: - - color = None - width = None - height = None - scale = None - endian = None - - header = file.readline().rstrip() - if header.decode("ascii") == "PF": - color = True - elif header.decode("ascii") == "Pf": - color = False - else: - raise Exception("Not a PFM file: " + path) - - dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) - if dim_match: - width, height = list(map(int, dim_match.groups())) - else: - raise Exception("Malformed PFM header.") - - scale = float(file.readline().decode("ascii").rstrip()) - if scale < 0: - # little-endian - endian = "<" - scale = -scale - else: - # big-endian - endian = ">" - - data = np.fromfile(file, endian + "f") - shape = (height, width, 3) if color else (height, width) - - data = np.reshape(data, shape) - data = np.flipud(data) - - return data, scale - - -def write_pfm(path, image, scale=1): - """Write pfm file. - - Args: - path (str): pathto file - image (array): data - scale (int, optional): Scale. Defaults to 1. - """ - - with open(path, "wb") as file: - color = None - - if image.dtype.name != "float32": - raise Exception("Image dtype must be float32.") - - image = np.flipud(image) - - if len(image.shape) == 3 and image.shape[2] == 3: # color image - color = True - elif ( - len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 - ): # greyscale - color = False - else: - raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") - - file.write("PF\n" if color else "Pf\n".encode()) - file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) - - endian = image.dtype.byteorder - - if endian == "<" or endian == "=" and sys.byteorder == "little": - scale = -scale - - file.write("%f\n".encode() % scale) - - image.tofile(file) - - -def read_image(path): - """Read image and output RGB image (0-1). - - Args: - path (str): path to file - - Returns: - array: RGB image (0-1) - """ - img = cv2.imread(path) - - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 - - return img - - -def resize_image(img): - """Resize image and make it fit for network. 
- - Args: - img (array): image - - Returns: - tensor: data ready for network - """ - height_orig = img.shape[0] - width_orig = img.shape[1] - - if width_orig > height_orig: - scale = width_orig / 384 - else: - scale = height_orig / 384 - - height = (np.ceil(height_orig / scale / 32) * 32).astype(int) - width = (np.ceil(width_orig / scale / 32) * 32).astype(int) - - img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) - - img_resized = ( - torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() - ) - img_resized = img_resized.unsqueeze(0) - - return img_resized - - -def resize_depth(depth, width, height): - """Resize depth map and bring to CPU (numpy). - - Args: - depth (tensor): depth - width (int): image width - height (int): image height - - Returns: - array: processed depth - """ - depth = torch.squeeze(depth[0, :, :, :]).to("cpu") - - depth_resized = cv2.resize( - depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC - ) - - return depth_resized - -def write_depth(path, depth, bits=1): - """Write depth map to pfm and png file. - - Args: - path (str): filepath without extension - depth (array): depth - """ - write_pfm(path + ".pfm", depth.astype(np.float32)) - - depth_min = depth.min() - depth_max = depth.max() - - max_val = (2**(8*bits))-1 - - if depth_max - depth_min > np.finfo("float").eps: - out = max_val * (depth - depth_min) / (depth_max - depth_min) - else: - out = np.zeros(depth.shape, dtype=depth.type) - - if bits == 1: - cv2.imwrite(path + ".png", out.astype("uint8")) - elif bits == 2: - cv2.imwrite(path + ".png", out.astype("uint16")) - - return diff --git a/ControlNet/ldm/util.py b/ControlNet/ldm/util.py deleted file mode 100644 index 45cb050ece6f401a22dde098ce3f1ff663c5eb6a..0000000000000000000000000000000000000000 --- a/ControlNet/ldm/util.py +++ /dev/null @@ -1,197 +0,0 @@ -import importlib - -import torch -from torch import optim -import numpy as np - -from inspect import isfunction -from PIL import Image, ImageDraw, ImageFont - - -def log_txt_as_img(wh, xc, size=10): - # wh a tuple of (width, height) - # xc a list of captions to plot - b = len(xc) - txts = list() - for bi in range(b): - txt = Image.new("RGB", wh, color="white") - draw = ImageDraw.Draw(txt) - font = ImageFont.truetype('font/DejaVuSans.ttf', size=size) - nc = int(40 * (wh[0] / 256)) - lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc)) - - try: - draw.text((0, 0), lines, fill="black", font=font) - except UnicodeEncodeError: - print("Cant encode string for logging. Skipping.") - - txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 - txts.append(txt) - txts = np.stack(txts) - txts = torch.tensor(txts) - return txts - - -def ismap(x): - if not isinstance(x, torch.Tensor): - return False - return (len(x.shape) == 4) and (x.shape[1] > 3) - - -def isimage(x): - if not isinstance(x,torch.Tensor): - return False - return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) - - -def exists(x): - return x is not None - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -def mean_flat(tensor): - """ - https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 - Take the mean over all non-batch dimensions. 
- """ - return tensor.mean(dim=list(range(1, len(tensor.shape)))) - - -def count_params(model, verbose=False): - total_params = sum(p.numel() for p in model.parameters()) - if verbose: - print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") - return total_params - - -def instantiate_from_config(config): - if not "target" in config: - if config == '__is_first_stage__': - return None - elif config == "__is_unconditional__": - return None - raise KeyError("Expected key `target` to instantiate.") - return get_obj_from_str(config["target"])(**config.get("params", dict())) - - -def get_obj_from_str(string, reload=False): - module, cls = string.rsplit(".", 1) - if reload: - module_imp = importlib.import_module(module) - importlib.reload(module_imp) - return getattr(importlib.import_module(module, package=None), cls) - - -class AdamWwithEMAandWings(optim.Optimizer): - # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 - def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using - weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code - ema_power=1., param_names=()): - """AdamW that saves EMA versions of the parameters.""" - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0.0 <= ema_decay <= 1.0: - raise ValueError("Invalid ema_decay value: {}".format(ema_decay)) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay, - ema_power=ema_power, param_names=param_names) - super().__init__(params, defaults) - - def __setstate__(self, state): - super().__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - ema_params_with_grad = [] - state_sums = [] - max_exp_avg_sqs = [] - state_steps = [] - amsgrad = group['amsgrad'] - beta1, beta2 = group['betas'] - ema_decay = group['ema_decay'] - ema_power = group['ema_power'] - - for p in group['params']: - if p.grad is None: - continue - params_with_grad.append(p) - if p.grad.is_sparse: - raise RuntimeError('AdamW does not support sparse gradients') - grads.append(p.grad) - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. 
values - state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of parameter values - state['param_exp_avg'] = p.detach().float().clone() - - exp_avgs.append(state['exp_avg']) - exp_avg_sqs.append(state['exp_avg_sq']) - ema_params_with_grad.append(state['param_exp_avg']) - - if amsgrad: - max_exp_avg_sqs.append(state['max_exp_avg_sq']) - - # update the steps for each param group update - state['step'] += 1 - # record the step after step update - state_steps.append(state['step']) - - optim._functional.adamw(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=amsgrad, - beta1=beta1, - beta2=beta2, - lr=group['lr'], - weight_decay=group['weight_decay'], - eps=group['eps'], - maximize=False) - - cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power) - for param, ema_param in zip(params_with_grad, ema_params_with_grad): - ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay) - - return loss \ No newline at end of file diff --git a/ControlNet/models/cldm_v15.yaml b/ControlNet/models/cldm_v15.yaml deleted file mode 100644 index 3e53feff3d3015dc2a8adfdaaf9bde67717ba121..0000000000000000000000000000000000000000 --- a/ControlNet/models/cldm_v15.yaml +++ /dev/null @@ -1,79 +0,0 @@ -model: - target: ControlNet.cldm.cldm.ControlLDM - params: - linear_start: 0.00085 - linear_end: 0.0120 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: "jpg" - cond_stage_key: "txt" - control_key: "hint" - image_size: 64 - channels: 4 - cond_stage_trainable: false - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - only_mid_control: False - - control_stage_config: - target: ControlNet.cldm.cldm.ControlNet - params: - image_size: 32 # unused - in_channels: 4 - hint_channels: 3 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_heads: 8 - use_spatial_transformer: True - transformer_depth: 1 - context_dim: 768 - use_checkpoint: True - legacy: False - - unet_config: - target: ControlNet.cldm.cldm.ControlledUnetModel - params: - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_heads: 8 - use_spatial_transformer: True - transformer_depth: 1 - context_dim: 768 - use_checkpoint: True - legacy: False - - first_stage_config: - target: ControlNet.ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ControlNet.ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/ControlNet/models/cldm_v21.yaml b/ControlNet/models/cldm_v21.yaml deleted file mode 100644 index fc65193647e476e108fce5977f11250d55919106..0000000000000000000000000000000000000000 --- a/ControlNet/models/cldm_v21.yaml +++ /dev/null @@ -1,85 +0,0 @@ -model: - target: cldm.cldm.ControlLDM - params: - linear_start: 0.00085 - linear_end: 0.0120 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: "jpg" - cond_stage_key: "txt" - control_key: "hint" - image_size: 64 - channels: 4 - cond_stage_trainable: false - conditioning_key: crossattn - monitor: 
val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - only_mid_control: False - - control_stage_config: - target: cldm.cldm.ControlNet - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 4 - hint_channels: 3 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False - - unet_config: - target: cldm.cldm.ControlledUnetModel - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" diff --git a/ControlNet/share.py b/ControlNet/share.py deleted file mode 100644 index a2307183cc2a7580044011f5a46a130ba804a2ef..0000000000000000000000000000000000000000 --- a/ControlNet/share.py +++ /dev/null @@ -1,8 +0,0 @@ -import ControlNet.config -from ControlNet.cldm.hack import disable_verbosity, enable_sliced_attention - - -disable_verbosity() - -if ControlNet.config.save_memory: - enable_sliced_attention() diff --git a/ControlNet/tool_add_control.py b/ControlNet/tool_add_control.py deleted file mode 100644 index 8076b5143405e5516b063f4fd63096f65cffbed2..0000000000000000000000000000000000000000 --- a/ControlNet/tool_add_control.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -import os - -assert len(sys.argv) == 3, 'Args are wrong.' - -input_path = sys.argv[1] -output_path = sys.argv[2] - -assert os.path.exists(input_path), 'Input model does not exist.' -assert not os.path.exists(output_path), 'Output filename already exists.' -assert os.path.exists(os.path.dirname(output_path)), 'Output path is not valid.' 
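# Assumed invocation for this initialisation script (paths are illustrative; they follow the
# checkpoints referenced elsewhere in this repo, e.g. tutorial_train.py's resume_path):
#   python tool_add_control.py ./models/v1-5-pruned.ckpt ./models/control_sd15_ini.ckpt
# input_path must be an existing SD checkpoint; output_path must not exist yet, but its
# parent directory must.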
- -import torch -from share import * -from cldm.model import create_model - - -def get_node_name(name, parent_name): - if len(name) <= len(parent_name): - return False, '' - p = name[:len(parent_name)] - if p != parent_name: - return False, '' - return True, name[len(parent_name):] - - -model = create_model(config_path='./models/cldm_v15.yaml') - -pretrained_weights = torch.load(input_path) -if 'state_dict' in pretrained_weights: - pretrained_weights = pretrained_weights['state_dict'] - -scratch_dict = model.state_dict() - -target_dict = {} -for k in scratch_dict.keys(): - is_control, name = get_node_name(k, 'control_') - if is_control: - copy_k = 'model.diffusion_' + name - else: - copy_k = k - if copy_k in pretrained_weights: - target_dict[k] = pretrained_weights[copy_k].clone() - else: - target_dict[k] = scratch_dict[k].clone() - print(f'These weights are newly added: {k}') - -model.load_state_dict(target_dict, strict=True) -torch.save(model.state_dict(), output_path) -print('Done.') diff --git a/ControlNet/tool_add_control_sd21.py b/ControlNet/tool_add_control_sd21.py deleted file mode 100644 index 7c3ac5f9bb398b9d20cb7d750f1f7c0717670eae..0000000000000000000000000000000000000000 --- a/ControlNet/tool_add_control_sd21.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -import os - -assert len(sys.argv) == 3, 'Args are wrong.' - -input_path = sys.argv[1] -output_path = sys.argv[2] - -assert os.path.exists(input_path), 'Input model does not exist.' -assert not os.path.exists(output_path), 'Output filename already exists.' -assert os.path.exists(os.path.dirname(output_path)), 'Output path is not valid.' - -import torch -from share import * -from cldm.model import create_model - - -def get_node_name(name, parent_name): - if len(name) <= len(parent_name): - return False, '' - p = name[:len(parent_name)] - if p != parent_name: - return False, '' - return True, name[len(parent_name):] - - -model = create_model(config_path='./models/cldm_v21.yaml') - -pretrained_weights = torch.load(input_path) -if 'state_dict' in pretrained_weights: - pretrained_weights = pretrained_weights['state_dict'] - -scratch_dict = model.state_dict() - -target_dict = {} -for k in scratch_dict.keys(): - is_control, name = get_node_name(k, 'control_') - if is_control: - copy_k = 'model.diffusion_' + name - else: - copy_k = k - if copy_k in pretrained_weights: - target_dict[k] = pretrained_weights[copy_k].clone() - else: - target_dict[k] = scratch_dict[k].clone() - print(f'These weights are newly added: {k}') - -model.load_state_dict(target_dict, strict=True) -torch.save(model.state_dict(), output_path) -print('Done.') diff --git a/ControlNet/tool_transfer_control.py b/ControlNet/tool_transfer_control.py deleted file mode 100644 index b84442cc93f7f9c30cb7311b8675d9124a6e8ec9..0000000000000000000000000000000000000000 --- a/ControlNet/tool_transfer_control.py +++ /dev/null @@ -1,59 +0,0 @@ -path_sd15 = './models/v1-5-pruned.ckpt' -path_sd15_with_control = './models/control_sd15_openpose.pth' -path_input = './models/anything-v3-full.safetensors' -path_output = './models/control_any3_openpose.pth' - - -import os - - -assert os.path.exists(path_sd15), 'Input path_sd15 does not exists!' -assert os.path.exists(path_sd15_with_control), 'Input path_sd15_with_control does not exists!' -assert os.path.exists(path_input), 'Input path_input does not exists!' -assert os.path.exists(os.path.dirname(path_output)), 'Output folder not exists!' 
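# A minimal, self-contained sketch (not part of the original tools) of the key mapping the
# scripts above rely on: keys starting with 'control_' are looked up in the Stable Diffusion
# checkpoint under 'model.diffusion_' + the remainder; all other keys are copied verbatim
# when present in the pretrained weights.
def map_controlnet_key(key: str) -> str:
    prefix = 'control_'
    if len(key) > len(prefix) and key.startswith(prefix):
        return 'model.diffusion_' + key[len(prefix):]
    return key

# Example: 'control_model.input_blocks.0.0.weight' -> 'model.diffusion_model.input_blocks.0.0.weight'
assert map_controlnet_key('control_model.input_blocks.0.0.weight') == 'model.diffusion_model.input_blocks.0.0.weight'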
- - -import torch -from share import * -from cldm.model import load_state_dict - - -sd15_state_dict = load_state_dict(path_sd15) -sd15_with_control_state_dict = load_state_dict(path_sd15_with_control) -input_state_dict = load_state_dict(path_input) - - -def get_node_name(name, parent_name): - if len(name) <= len(parent_name): - return False, '' - p = name[:len(parent_name)] - if p != parent_name: - return False, '' - return True, name[len(parent_name):] - - -keys = sd15_with_control_state_dict.keys() - -final_state_dict = {} -for key in keys: - is_first_stage, _ = get_node_name(key, 'first_stage_model') - is_cond_stage, _ = get_node_name(key, 'cond_stage_model') - if is_first_stage or is_cond_stage: - final_state_dict[key] = input_state_dict[key] - continue - p = sd15_with_control_state_dict[key] - is_control, node_name = get_node_name(key, 'control_') - if is_control: - sd15_key_name = 'model.diffusion_' + node_name - else: - sd15_key_name = key - if sd15_key_name in input_state_dict: - p_new = p + input_state_dict[sd15_key_name] - sd15_state_dict[sd15_key_name] - # print(f'Offset clone from [{sd15_key_name}] to [{key}]') - else: - p_new = p - # print(f'Direct clone to [{key}]') - final_state_dict[key] = p_new - -torch.save(final_state_dict, path_output) -print('Transferred model saved at ' + path_output) diff --git a/ControlNet/tutorial_dataset.py b/ControlNet/tutorial_dataset.py deleted file mode 100644 index fb327f981d10cf94e6a7f55f5b2b4497d3e7a9cb..0000000000000000000000000000000000000000 --- a/ControlNet/tutorial_dataset.py +++ /dev/null @@ -1,39 +0,0 @@ -import json -import cv2 -import numpy as np - -from torch.utils.data import Dataset - - -class MyDataset(Dataset): - def __init__(self): - self.data = [] - with open('./training/fill50k/prompt.json', 'rt') as f: - for line in f: - self.data.append(json.loads(line)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = self.data[idx] - - source_filename = item['source'] - target_filename = item['target'] - prompt = item['prompt'] - - source = cv2.imread('./training/fill50k/' + source_filename) - target = cv2.imread('./training/fill50k/' + target_filename) - - # Do not forget that OpenCV read images in BGR order. - source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB) - target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB) - - # Normalize source images to [0, 1]. - source = source.astype(np.float32) / 255.0 - - # Normalize target images to [-1, 1]. 
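# (so a target value of 0 maps to -1.0 and 255 maps to +1.0, while the source hints above stay in [0, 1])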
- target = (target.astype(np.float32) / 127.5) - 1.0 - - return dict(jpg=target, txt=prompt, hint=source) - diff --git a/ControlNet/tutorial_dataset_test.py b/ControlNet/tutorial_dataset_test.py deleted file mode 100644 index b0c4c355065d15c266a8b4e8c68dfcbe2b246730..0000000000000000000000000000000000000000 --- a/ControlNet/tutorial_dataset_test.py +++ /dev/null @@ -1,12 +0,0 @@ -from tutorial_dataset import MyDataset - -dataset = MyDataset() -print(len(dataset)) - -item = dataset[1234] -jpg = item['jpg'] -txt = item['txt'] -hint = item['hint'] -print(txt) -print(jpg.shape) -print(hint.shape) diff --git a/ControlNet/tutorial_train.py b/ControlNet/tutorial_train.py deleted file mode 100644 index 393d7addb164c32eff9c3d675e4f32fb555868f0..0000000000000000000000000000000000000000 --- a/ControlNet/tutorial_train.py +++ /dev/null @@ -1,35 +0,0 @@ -from share import * - -import pytorch_lightning as pl -from torch.utils.data import DataLoader -from tutorial_dataset import MyDataset -from cldm.logger import ImageLogger -from cldm.model import create_model, load_state_dict - - -# Configs -resume_path = './models/control_sd15_ini.ckpt' -batch_size = 4 -logger_freq = 300 -learning_rate = 1e-5 -sd_locked = True -only_mid_control = False - - -# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs. -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict(resume_path, location='cpu')) -model.learning_rate = learning_rate -model.sd_locked = sd_locked -model.only_mid_control = only_mid_control - - -# Misc -dataset = MyDataset() -dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True) -logger = ImageLogger(batch_frequency=logger_freq) -trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger]) - - -# Train! -trainer.fit(model, dataloader) diff --git a/ControlNet/tutorial_train_sd21.py b/ControlNet/tutorial_train_sd21.py deleted file mode 100644 index 8bbc148f9b1e90561f5a186cc0be94c911dd67cf..0000000000000000000000000000000000000000 --- a/ControlNet/tutorial_train_sd21.py +++ /dev/null @@ -1,35 +0,0 @@ -from share import * - -import pytorch_lightning as pl -from torch.utils.data import DataLoader -from tutorial_dataset import MyDataset -from cldm.logger import ImageLogger -from cldm.model import create_model, load_state_dict - - -# Configs -resume_path = './models/control_sd21_ini.ckpt' -batch_size = 4 -logger_freq = 300 -learning_rate = 1e-5 -sd_locked = True -only_mid_control = False - - -# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs. -model = create_model('./models/cldm_v21.yaml').cpu() -model.load_state_dict(load_state_dict(resume_path, location='cpu')) -model.learning_rate = learning_rate -model.sd_locked = sd_locked -model.only_mid_control = only_mid_control - - -# Misc -dataset = MyDataset() -dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True) -logger = ImageLogger(batch_frequency=logger_freq) -trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger]) - - -# Train! 
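# (as configured above: single GPU, fp32; Lightning moves the CPU-loaded model to the GPU here)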
-trainer.fit(model, dataloader) diff --git a/README copy.md b/README copy.md deleted file mode 100644 index 1b13e4357e4e87180f5ac9842e85df1b774432fd..0000000000000000000000000000000000000000 --- a/README copy.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: PatchFusion -app_file: app.py -sdk: gradio -sdk_version: 4.8.0 ---- diff --git a/README.md b/README.md index 85c6db37a02e9fc6744899be691579c57e7aff93..4257a279c1a8f241bfbd31d91818a7a76718c23b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ emoji: 📚 colorFrom: red colorTo: green sdk: gradio -sdk_version: 4.8.0 +sdk_version: 4.14.0 app_file: app.py pinned: false --- diff --git a/app.py b/app.py index e0260b31883608425713d960019c359e8291e5d9..bf065ed00d4b3aa7b88f689dd6acc05738f885c8 100644 --- a/app.py +++ b/app.py @@ -22,124 +22,76 @@ # File author: Zhenyu Li -import gc -import copy -from ControlNet.share import * -import einops +import os +import cv2 import torch -import random - -import ControlNet.config as config -from pytorch_lightning import seed_everything -from ControlNet.cldm.model import create_model, load_state_dict -from ControlNet.cldm.ddim_hacked import DDIMSampler - import gradio as gr -import torch +from estimator.models.patchfusion import PatchFusion +from gradio_imageslider import ImageSlider +from torchvision import transforms import numpy as np -from zoedepth.utils.arg_utils import parse_unknown -import argparse -from zoedepth.models.builder import build_model -from zoedepth.utils.config import get_config_user -import gradio as gr - -from ui_prediction import predict_depth import torch.nn.functional as F - -from huggingface_hub import hf_hub_download -import matplotlib - from PIL import Image import tempfile - +# import spaces +import matplotlib +from huggingface_hub import hf_hub_download +import copy +from zoedepth.models.zoedepth import ZoeDepth + +css = """ +#img-display-container { + max-height: 100vh; + } +#img-display-input { + max-height: 80vh; + } +#img-display-output { + max-height: 80vh; + } +""" DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' - -def depth_load_state_dict(model, state_dict): - """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. - - DataParallel prefixes state_dict keys with 'module.' when saving. - If the model is not a DataParallel model but the state_dict is, then prefixes are removed. - If the model is a DataParallel model but the state_dict is not, then prefixes are added. - """ - state_dict = state_dict.get('model', state_dict) - # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' - - do_prefix = isinstance( - model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) - state = {} - for k, v in state_dict.items(): - if k.startswith('module.') and not do_prefix: - k = k[7:] - - if not k.startswith('module.') and do_prefix: - k = 'module.' 
+ k - - state[k] = v - - model.load_state_dict(state, strict=True) - print("Loaded successfully") - return model - -def load_wts(model, checkpoint_path): - ckpt = torch.load(checkpoint_path, map_location='cpu') - return depth_load_state_dict(model, ckpt) - -def load_ckpt(model, checkpoint): - model = load_wts(model, checkpoint) - print("Loaded weights from {0}".format(checkpoint)) - return model - - -pf_ckp = hf_hub_download(repo_id="zhyever/PatchFusion", filename="patchfusion_u4k.pt") -parser = argparse.ArgumentParser() -parser.add_argument("--ckp_path", type=str, default=pf_ckp) -parser.add_argument("-m", "--model", type=str, default="zoedepth_custom") -parser.add_argument("--model_cfg_path", type=str, default="./zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion.json") -args, unknown_args = parser.parse_known_args() -overwrite_kwargs = parse_unknown(unknown_args) -overwrite_kwargs['model_cfg_path'] = args.model_cfg_path -overwrite_kwargs["model"] = args.model -config_depth = get_config_user(args.model, **overwrite_kwargs) -config_depth["pretrained_resource"] = '' -depth_model = build_model(config_depth) -depth_model = load_ckpt(depth_model, args.ckp_path) -depth_model.eval() - -controlnet_ckp = hf_hub_download(repo_id="zhyever/PatchFusion", filename="control_sd15_depth.pth") -model = create_model('./ControlNet/models/cldm_v15.yaml') -model.load_state_dict(load_state_dict(controlnet_ckp, location=DEVICE), strict=False) -model = model.to(DEVICE) -ddim_sampler = DDIMSampler(model) - -def colorize(value, cmap='magma_r', vmin=None, vmax=None): - - percentile = 0.03 - vmin = np.percentile(value, percentile) - vmax = np.percentile(value, 100 - percentile) - - if vmin != vmax: - value = (value - vmin) / (vmax - vmin) # vmin..vmax - else: - value = value * 0. - - cmapper = matplotlib.cm.get_cmap(cmap) - value = cmapper(value, bytes=True) # ((1)xhxwx4) - - value = value[:, :, :3] # bgr -> rgb - # rgb_value = value[..., ::-1] - rgb_value = value - - rgb_value = np.transpose(rgb_value, (2, 0, 1)) - rgb_value = rgb_value[np.newaxis, ...] 
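# (resulting colored depth has shape (1, 3, H, W) with dtype uint8)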
- - return rgb_value - -def colorize_depth_maps(depth_map, min_depth=0, max_depth=0, cmap='Spectral_r', valid_mask=None): +model_name = 'Zhyever/patchfusion_depth_anything_vitl14' + +# 'Zhyever/patchfusion_depth_anything_vits14', +# 'Zhyever/patchfusion_depth_anything_vitb14', +# 'Zhyever/patchfusion_depth_anything_vitl14', +# 'Zhyever/patchfusion_zoedepth' + +model = PatchFusion.from_pretrained(model_name).to(DEVICE).eval() + +pf_ckp = hf_hub_download(repo_id="Zhyever/PatchFusion", filename="DepthAnything_vitl.pt") +depth_anything_base = ZoeDepth.build(**model.config.coarse_branch) +depth_anything_base.load_state_dict(torch.load(pf_ckp, map_location='cpu')['model']) +depth_anything_base = depth_anything_base.to(DEVICE).eval() + +# @spaces.GPU +@torch.no_grad() +def predict_depth(model, image_lr, image_hr, mode, patch_number, resolution_h, resolution_w, patch_split_number_h, patch_split_number_w): + process_num = 4 + tile_cfg = dict() + tile_cfg['image_raw_shape'] = (resolution_h, resolution_w) + tile_cfg['patch_split_num'] = (patch_split_number_h, patch_split_number_w) + if mode == 'r': + mode = mode + str(patch_number) + result, log_dict = model(mode='infer', cai_mode=mode, process_num=process_num, image_lr=image_lr, image_hr=image_hr, tile_cfg=tile_cfg) + return result + +# def colorize_depth(depth): +# depth = (depth - depth.min()) / (depth.max() - depth.min()) +# depth = (1 - depth) * 255.0 +# depth = depth.astype(np.uint8) +# colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] +# return colored_depth + + +def colorize_depth(depth_map, min_depth=0, max_depth=0, cmap='Spectral'): """ Colorize depth maps. """ percentile = 0.03 + # percentile = 5 min_depth = np.percentile(depth_map, percentile) max_depth = np.percentile(depth_map, 100 - percentile) @@ -157,214 +109,134 @@ def colorize_depth_maps(depth_map, min_depth=0, max_depth=0, cmap='Spectral_r', cm = matplotlib.colormaps[cmap] depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1) img_colored_np = cm(depth, bytes=False)[:,:,:,0:3] # value from 0 to 1 - img_colored_np = np.rollaxis(img_colored_np, 3, 1) - - if valid_mask is not None: - if isinstance(depth_map, torch.Tensor): - valid_mask = valid_mask.detach().numpy() - valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W] - if valid_mask.ndim < 3: - valid_mask = valid_mask[np.newaxis, np.newaxis, :, :] - else: - valid_mask = valid_mask[:, np.newaxis, :, :] - valid_mask = np.repeat(valid_mask, 3, axis=1) - img_colored_np[~valid_mask] = 0 - - if isinstance(depth_map, torch.Tensor): - img_colored = torch.from_numpy(img_colored_np).float() - elif isinstance(depth_map, np.ndarray): - img_colored = img_colored_np - - return img_colored - -def hack_process(path_input, path_depth=None, path_gen=None): - if path_depth is not None and path_gen is not None: - return path_input, path_depth, path_gen - -def rescale(A, lbound=-1, ubound=1): - """ - Rescale an array to [lbound, ubound]. - - Parameters: - - A: Input data as numpy array - - lbound: Lower bound of the scale, default is 0. - - ubound: Upper bound of the scale, default is 1. 
- - Returns: - - Rescaled array - """ - A_min = np.min(A) - A_max = np.max(A) - return (ubound - lbound) * (A - A_min) / (A_max - A_min) + lbound + # img_colored_np = np.rollaxis(img_colored_np, 3, 1) -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, mode, patch_number, resolution, patch_size, color_map): - with torch.no_grad(): - w, h = input_image.size - - depth_model.to(DEVICE) - detected_map = predict_depth(depth_model, input_image, mode, patch_number, resolution, patch_size, device=DEVICE) - detected_map_save = copy.deepcopy(detected_map) - tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) - detected_map_save = Image.fromarray((detected_map_save*256).astype('uint16')) - detected_map_save.save(tmp.name) - - depth_model.cpu() # free some mem - gc.collect() - torch.cuda.empty_cache() - - if color_map == 'magma': - colored_depth = colorize(detected_map) - elif color_map == 'gray': - colored_depth = colorize(detected_map, cmap='gray_r') - else: - colored_depth = colorize_depth_maps(detected_map) * 255 - - detected_map = F.interpolate(torch.from_numpy(detected_map).unsqueeze(dim=0).unsqueeze(dim=0), (image_resolution, image_resolution), mode='bicubic', align_corners=True).squeeze().numpy() - - H, W = detected_map.shape - detected_map_temp = ((1 - detected_map / (np.max(detected_map + 1e-3))) * 255) - detected_map = detected_map_temp.astype("uint8") - - detected_map_temp = detected_map_temp[:, :, None] - detected_map_temp = np.concatenate([detected_map_temp, detected_map_temp, detected_map_temp], axis=2) - detected_map = detected_map[:, :, None] - detected_map = np.concatenate([detected_map, detected_map, detected_map], axis=2) - - control = torch.from_numpy(detected_map.copy()).float().to(DEVICE) / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255) - - results = [x_samples[i] for i in range(num_samples)] - - return_list = [colored_depth] + results - update_return_list = [] - for idx, r in enumerate(return_list): - if idx == 0: - t_r = torch.from_numpy(r) - else: - t_r = torch.from_numpy(r).unsqueeze(dim=0).permute(0, 3, 1, 2) - # t_r = F.interpolate(t_r, (h, w), mode='bicubic', align_corners=True).squeeze().permute(1, 2, 0).numpy().astype(np.uint8) - t_r = t_r.squeeze().permute(1, 2, 0).numpy().astype(np.uint8) - update_return_list.append(t_r) - update_return_list.append(tmp.name) + img_colored_np = img_colored_np * 255.0 + img_colored_np = img_colored_np.astype(np.uint8) + img_colored_np = img_colored_np[0, :, :, :] + + return img_colored_np - return update_return_list +def gallery_fn(image, coarse, fine, **kwargs): + return [(coarse, fine), image] title = "# PatchFusion" description = """Official demo for **PatchFusion: An End-to-End Tile-Based Framework for High-Resolution Monocular Metric Depth Estimation**. PatchFusion is a deep learning model for high-resolution metric depth estimation from a single image. - Please refer to our [project webpage](https://zhyever.github.io/patchfusion), [paper](https://arxiv.org/abs/2312.02284) or [github](https://github.com/zhyever/PatchFusion) for more details. +We use Depth-Anything VitL14 as the default model in this demo. +You can slide the output to compare the depth prediction from the base Depth-Anything model and PatchFusion. -**Running PatchFusion depth estimation pipeline needs about 12GB memory on 4K images.** +""" -# Advanced tips +markdown = """ +PatchFusion works on default 4k (2160x3840) resolution. All input images will be resized to 4k before passing through PatchFusion as default. Users can increase the processing resolution in the advanced option (resolution_h, resolution_w). -The overall pipeline: image --> (PatchFusion) --> depth --> (controlnet) --> generated image. +PatchFusion has three modes: m1, m2, and rn. They are corresponding to the different tiling strategies P16, P49, and Rn in our paper. We rename it for general usage. Please refer to the documentation provided in [github inference instruction](https://github.com/zhyever/PatchFusion/docs/user_infer.md). -As for the PatchFusion, it works on default 4k (2160x3840) resolution. All input images will be resized to 4k before passing through PatchFusion as default. It means if you have a higher resolution image, you might want to increase the processing resolution in the advanced option (You would also change the patch size to 1/4 image resolution). Because of the tiling strategy, our PatchFusion would not use more memory or time for even higher resolution inputs if properly setting parameters. +We provide customized tiling variations for m1 and m2. Users can adjust the patch split number in the advanced option (patch_split_number_h, patch_split_number_w) (default: 4x4 splitting). The output depth map is resized to the original image resolution. Download for better visualization quality. 16-Bit Raw Depth = (pred_depth * 256).to(uint16). 
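+For example, if the saved 16-bit PNG is read back as uint16 (e.g. cv2.imread(path, cv2.IMREAD_UNCHANGED)), metric depth can be recovered as depth = png.astype(np.float32) / 256.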
- -We provide three color maps to render depth map, which are magma (more common in supervised depth estimation), spectral (better looking), and gray (thanks for the suggestion from petermg ). Please choose from the advanced option. - -For ControlNet, it works on default 896x896 resolution. Again, all input images will be resized to 896x896 before passing through ControlNet as default. You might be not happy because the 4K->896x896 downsampling, but limited by the GPU resource, this demo could only achieve this. This is the memory bottleneck. The output is not resized back to the image resolution for fast inference (Well... It's still so slow now... :D). - """ -with gr.Blocks() as demo: +image_resizer = model.resizer + +with gr.Blocks(css=css) as demo: gr.Markdown(title) gr.Markdown(description) - - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Depth Maps") + gr.Markdown("### Depth Prediction demo") + gr.Markdown(markdown) + + prediction_coarse = gr.File( + visible=False, + ) + prediction_fine = gr.File( + visible=False, + ) with gr.Row(): with gr.Accordion("Advanced options", open=False): - # mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='R') - mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='P49') - patch_number = gr.Slider(1, 1024, label="Please decide the number of random patches (Only useful in mode=R)", step=1, value=256) - resolution = gr.Textbox(label="(PatchFusion) Proccessing resolution (Default 4K. Use 'x' to split height and width.)", elem_id='mode', value='2160x3840') - patch_size = gr.Textbox(label="(PatchFusion) Patch size (Default 1/4 of image resolution. 
Use 'x' to split height and width.)", elem_id='mode', value='540x960') - color_map = gr.Radio(["magma", "spectral", "gray"], label="Colormap used to render depth map", elem_id='mode', value='magma') - - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="ControlNet image resolution (higher resolution will lead to OOM)", minimum=256, maximum=1024, value=896, step=64) - strength = gr.Slider(label="Control strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - # detect_resolution = gr.Slider(label="Depth Resolution", minimum=128, maximum=1024, value=384, step=1) - ddim_steps = gr.Slider(label="steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="guidance scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative prompt", value='worst quality, low quality, lose details') - - with gr.Row(): - with gr.Column(): - # input_image = gr.Image(source='upload', type="pil") - input_image = gr.Image(label="Input Image", type='pil') - prompt = gr.Textbox(label="Prompt (input your description)", value='A cozy cottage in an oil painting, with rich textures and vibrant green foliage') - run_button = gr.Button("Run") + resolution_h = gr.Slider(label="Proccessing resolution height (Default 4K, 2160)", minimum=256, maximum=2700, value=2160, step=10) + resolution_w = gr.Slider(label="Proccessing resolution width (Default 4K, 3840)", minimum=256, maximum=4800, value=3840, step=10) + mode = gr.Radio(["m2", "r"], label="Tiling mode", info="We recommand using M2 for fast evaluation and R with 256 patches for best visualization results, respectively", elem_id='mode', value='m2') + # mode = gr.Radio(["m1", "m2", "r"], label="Tiling mode", info="We recommand using M2 for fast evaluation and R with 256 patches for best visualization results, respectively", elem_id='mode', value='m1') + patch_number = gr.Slider(1, 256, label="Please decide the number of random patches (Only useful in mode=R)", step=1, value=256) + patch_split_number_h = gr.Slider(label="Patch split number of height dimension (Default 4)", minimum=2, maximum=8, value=4, step=1) + patch_split_number_w = gr.Slider(label="Patch split number of width dimension (Default 4)", minimum=2, maximum=8, value=4, step=1) - generated_image = gr.Image(label="Generated Map", elem_id='img-display-output') + color_map = gr.Radio(["turbo_r", "magma_r", "Spectral", "gray"], label="Colormap used to render depth map", elem_id='mode', value='turbo_r') with gr.Row(): - depth_image = gr.Image(label="Depth Map", elem_id='img-display-output') + input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') + with gr.Row(): - raw_file = gr.File(label="16-Bit Raw Depth, Multiplier:256") + depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) + + raw_file = gr.File(label="16-Bit Raw Depth, Multiplier:256") + submit = gr.Button("Submit") + + def on_submit(image, mode, patch_number, resolution_h, resolution_w, patch_split_number_h, patch_split_number_w, color_map): + h, w = image.shape[:2] + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 + image = 
transforms.ToTensor()(np.asarray(image)) # raw image + image_lr = image_resizer(image.unsqueeze(dim=0)).float().to(DEVICE) + + # make sure the resolution and patch number are valid + # adjust the relationship + if resolution_h % (2 * patch_split_number_h) != 0: + resolution_h = resolution_h // (2 * patch_split_number_h) * (2 * patch_split_number_h) + if resolution_w % (2 * patch_split_number_w) != 0: + resolution_w = resolution_w // (2 * patch_split_number_w) * (2 * patch_split_number_w) + + image_hr = F.interpolate(image.unsqueeze(dim=0), (resolution_h, resolution_w), mode='bicubic', align_corners=True).float().to(DEVICE) + + pf_prediction = predict_depth(model, image_lr, image_hr, mode, patch_number, resolution_h, resolution_w, patch_split_number_h, patch_split_number_w) + coarse_prediction = depth_anything_base(image_lr)['metric_depth'] + pf_prediction = F.interpolate(pf_prediction, (h, w))[0, 0].detach().cpu().numpy() + coarse_prediction = F.interpolate(coarse_prediction, (h, w))[0, 0].detach().cpu().numpy() + + pf_prediction_colored = colorize_depth(pf_prediction, cmap=color_map) + coarse_predictioncolored = colorize_depth(coarse_prediction, cmap=color_map) + + raw_depth = Image.fromarray((pf_prediction*256).astype('uint16')) + tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + raw_depth.save(tmp.name) - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, mode, patch_number, resolution, patch_size, color_map] - run_button.click(fn=process, inputs=ips, outputs=[depth_image, generated_image, raw_file]) + return [(coarse_predictioncolored, pf_prediction_colored), tmp.name] + + ips = [input_image, mode, patch_number, resolution_h, resolution_w, patch_split_number_h, patch_split_number_w, color_map] + submit.click(on_submit, inputs=ips, outputs=[depth_image_slider, raw_file]) + + inputs = [input_image, prediction_coarse, prediction_fine] + outputs = [depth_image_slider, input_image,] + examples = gr.Examples( - inputs=[input_image, depth_image, generated_image], - outputs=[input_image, depth_image, generated_image], - examples=[ - [ - "examples/example_4.jpeg", - "examples/2_depth.png", - "examples/2_gen.png", - - ], - [ - "examples/example_6.png", - "examples/4_depth.png", - "examples/4_gen.png", - ], - [ - "examples/example_1.jpeg", - "examples/1_depth.png", - "examples/1_gen.png", - ],], - cache_examples=True, - fn=hack_process) + inputs=inputs, + outputs=outputs, + examples=[ + [ + "examples/example_1.jpeg", + "examples/example_1_coarse.png", + "examples/example_1_fine.png", + + ], + [ + "examples/example_2.jpeg", + "examples/example_2_coarse.png", + "examples/example_2_fine.png", + + ], + [ + "examples/example_3.jpeg", + "examples/example_3_coarse.png", + "examples/example_3_fine.png", + + ]], + cache_examples=True, + fn=gallery_fn,) + if __name__ == '__main__': demo.queue().launch(share=True) \ No newline at end of file diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 53ebc442cc2e3395b0c6cb841e64a511cb26004a..0000000000000000000000000000000000000000 --- a/config.yaml +++ /dev/null @@ -1,129 +0,0 @@ -!!python/object/new:zoedepth.utils.easydict.EasyDict -dictitems: - attractor_alpha: 1000 - attractor_gamma: 2 - attractor_kind: mean - attractor_type: inv - aug: true - baseline: true - bin_centers_type: softplus - bin_embedding_dim: 128 - clip_grad: 0.1 - condition: true - dataset: nyu - distributed: true - do_resize: false - force_keep_ar: true - freeze: true - g2l: true - 
gpu: null - img_size: &id001 - - 384 - - 512 - inverse_midas: false - log_images_every: 0.2 - max_temp: 50.0 - max_translation: 100 - memory_efficient: true - midas_model_type: DPT_BEiT_L_384 - min_temp: 0.0212 - model: zoedepth_custom - n_attractors: &id002 - - 16 - - 8 - - 4 - - 1 - n_bins: 64 - name: PatchFusion - notes: '' - output_distribution: logbinomial - prefetch: false - pretrained_resource: '' - print_losses: false - project: ZoeDepth - random_crop: false - random_translate: false - raw_depth_shape: &id003 - - 2160 - - 3840 - root: . - sample_feat_level: 4 - sampled_training: false - save_dir: ./nfs/monodepth3_checkpoints - shared_dict: null - sr_ratio: 1 - tags: '' - train_midas: false - transform_sample_gt_size: &id004 - - 2160 - - 3840 - translate_prob: 0.2 - uid: null - use_amp: false - use_area_prior: true - use_fusion_network: true - use_hr: false - use_pretrained_midas: false - use_shared_dict: false - validate_every: 1 - version_name: patchfusion - workers: 24 -state: - attractor_alpha: 1000 - attractor_gamma: 2 - attractor_kind: mean - attractor_type: inv - aug: true - baseline: true - bin_centers_type: softplus - bin_embedding_dim: 128 - clip_grad: 0.1 - condition: true - dataset: nyu - distributed: true - do_resize: false - force_keep_ar: true - freeze: true - g2l: true - gpu: null - img_size: *id001 - inverse_midas: false - log_images_every: 0.2 - max_temp: 50.0 - max_translation: 100 - memory_efficient: true - midas_model_type: DPT_BEiT_L_384 - min_temp: 0.0212 - model: zoedepth_custom - n_attractors: *id002 - n_bins: 64 - name: PatchFusion - notes: '' - output_distribution: logbinomial - prefetch: false - pretrained_resource: '' - print_losses: false - project: ZoeDepth - random_crop: false - random_translate: false - raw_depth_shape: *id003 - root: . 
- sample_feat_level: 4 - sampled_training: false - save_dir: ./nfs/monodepth3_checkpoints - shared_dict: null - sr_ratio: 1 - tags: '' - train_midas: false - transform_sample_gt_size: *id004 - translate_prob: 0.2 - uid: null - use_amp: false - use_area_prior: true - use_fusion_network: true - use_hr: false - use_pretrained_midas: false - use_shared_dict: false - validate_every: 1 - version_name: patchfusion - workers: 24 diff --git a/configs/_base_/run_time.py b/configs/_base_/run_time.py new file mode 100644 index 0000000000000000000000000000000000000000..f5d9f36d382399179da6d103acf3644dc754b029 --- /dev/null +++ b/configs/_base_/run_time.py @@ -0,0 +1,5 @@ + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vitb_coarse_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vitb_coarse_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..9e813fdcbb2515d3d3317df829ece85b1afd7409 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitb_coarse_pretrain_u4k.py @@ -0,0 +1,120 @@ +_base_ = [ + '../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitb', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vitb.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + min_depth=min_depth, + max_depth=max_depth, + target='coarse', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + 
cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vitb_fine_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vitb_fine_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..88104e6200fe47016c99a5af03445a5ac72f7547 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitb_fine_pretrain_u4k.py @@ -0,0 +1,121 @@ +_base_ = [ + '../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitb', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vitb.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + target='fine', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + 
resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vitb_patchfusion_u4k.py b/configs/patchfusion_depthanything/depthanything_vitb_patchfusion_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..d745c204f534889144e604e28eed2ad5799290f1 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitb_patchfusion_u4k.py @@ -0,0 +1,133 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py', + '../_base_/run_time.py' +] + +min_depth=1e-3 +max_depth=80 +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitb', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource=None, + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + + +model=dict( + type='PatchFusion', + config=dict( + image_raw_shape=(2160, 3840), + patch_split_num=(4, 4), + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + load_branch=True, + pretrain_model=['./work_dir/depthanything_vitb_u4k/coarse_pretrain/checkpoint_24.pth', './work_dir/depthanything_vitb_u4k/fine_pretrain/checkpoint_24.pth'], # coarse, fine + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + guided_fusion=dict( + type='GuidedFusionPatchFusion', + patch_process_shape=(392, 518), + in_channels=[32, 128, 128, 128, 128, 128], + num_patches=[392*518, 224*296, 112*148, 56*74, 28*37, 14*19], + n_channels=5, + g2l=True,), + sigloss=dict(type='SILogLoss'))) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=16, val_interval=2, save_checkpoint_interval=16, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.001), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=10, + final_div_factor=10000, + pct_start=0.25, + three_phase=False,) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + 
resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +general_dataloader=dict( + dataset=dict( + network_process_size=(392, 518), + resize_mode='depth-anything')) diff --git a/configs/patchfusion_depthanything/depthanything_vitl_coarse_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vitl_coarse_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..d914df7ca4e28053f7abbbea41ef80255eacd37c --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitl_coarse_pretrain_u4k.py @@ -0,0 +1,120 @@ +_base_ = [ + '../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitl', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vitl.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + min_depth=min_depth, + max_depth=max_depth, + target='coarse', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vitl_fine_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vitl_fine_pretrain_u4k.py new 
file mode 100644 index 0000000000000000000000000000000000000000..8681dd09fac9d5012d1c0b51d414fa0812d3c25a --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitl_fine_pretrain_u4k.py @@ -0,0 +1,121 @@ +_base_ = [ + '../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitl', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vitl.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + target='fine', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vitl_patchfusion_u4k.py b/configs/patchfusion_depthanything/depthanything_vitl_patchfusion_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..a967696d26e2039511ddf68d2f06b41f24ddf1f9 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vitl_patchfusion_u4k.py @@ -0,0 +1,133 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py', + '../_base_/run_time.py' +] + 
+min_depth=1e-3 +max_depth=80 +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vitl', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource=None, + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + + +model=dict( + type='PatchFusion', + config=dict( + image_raw_shape=(2160, 3840), + patch_split_num=(4, 4), + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + load_branch=True, + pretrain_model=['./work_dir/depthanything_vitl_u4k/coarse_pretrain/checkpoint_24.pth', './work_dir/depthanything_vitl_u4k/fine_pretrain/checkpoint_24.pth'], # coarse, fine + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + guided_fusion=dict( + type='GuidedFusionPatchFusion', + patch_process_shape=(392, 518), + in_channels=[32, 256, 256, 256, 256, 256], + num_patches=[392*518, 224*296, 112*148, 56*74, 28*37, 14*19], + n_channels=5, + g2l=True,), + sigloss=dict(type='SILogLoss'))) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=16, val_interval=2, save_checkpoint_interval=16, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.001), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=10, + final_div_factor=10000, + pct_start=0.25, + three_phase=False,) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +general_dataloader=dict( + dataset=dict( + network_process_size=(392, 518), + resize_mode='depth-anything')) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vits_coarse_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vits_coarse_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..f14ecb30abd26e4277c2a23302f663f05fdeab1e --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vits_coarse_pretrain_u4k.py @@ -0,0 +1,120 @@ +_base_ = [ + 
'../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vits', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vits.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + min_depth=min_depth, + max_depth=max_depth, + target='coarse', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vits_fine_pretrain_u4k.py b/configs/patchfusion_depthanything/depthanything_vits_fine_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..2fa5d05b1b5f902d31e7d08ba80d0969518c7384 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vits_fine_pretrain_u4k.py @@ -0,0 +1,121 @@ +_base_ = [ + '../_base_/datasets/u4k.py', +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vits', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/DepthAnything_vits.pt', + 
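+    # The 'local::' prefix tells the ZoeDepth checkpoint loader to read the DepthAnything
+    # ViT-S weights from a file under ./work_dir instead of fetching a hosted checkpoint.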
use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + target='fine', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + # optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + optimizer=dict(type='AdamW', lr=0.0002/50, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) \ No newline at end of file diff --git a/configs/patchfusion_depthanything/depthanything_vits_patchfusion_u4k.py b/configs/patchfusion_depthanything/depthanything_vits_patchfusion_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..d498899fb2c825454f9d9ca00e2c58c058f619e2 --- /dev/null +++ b/configs/patchfusion_depthanything/depthanything_vits_patchfusion_u4k.py @@ -0,0 +1,133 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py', + '../_base_/run_time.py' +] + +min_depth=1e-3 +max_depth=80 +zoe_depth_config=dict( + type='DA-ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + depth_anything=True, + midas_model_type='vits', + img_size=[392, 518], + + # some important params + # midas_model_type='DPT_BEiT_L_384', + pretrained_resource=None, + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + 
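+    # Note: gradient clipping for this training run is configured again in optim_wrapper
+    # further below (max_norm=0.1); this clip_grad entry belongs to the inherited ZoeDepth
+    # argument set kept here for completeness.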
clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + + +model=dict( + type='PatchFusion', + config=dict( + image_raw_shape=(2160, 3840), + patch_split_num=(4, 4), + patch_process_shape=(392, 518), + min_depth=min_depth, + max_depth=max_depth, + load_branch=True, + pretrain_model=['./work_dir/depthanything_vits_u4k/coarse_pretrain/checkpoint_24.pth', './work_dir/depthanything_vits_u4k/fine_pretrain/checkpoint_24.pth'], # coarse, fine + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + guided_fusion=dict( + type='GuidedFusionPatchFusion', + patch_process_shape=(392, 518), + in_channels=[32, 64, 64, 64, 64, 64], + num_patches=[392*518, 224*296, 112*148, 56*74, 28*37, 14*19], + n_channels=5, + g2l=True,), + sigloss=dict(type='SILogLoss'))) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=16, val_interval=2, save_checkpoint_interval=16, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.001), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=10, + final_div_factor=10000, + pct_start=0.25, + three_phase=False,) + +convert_syncbn=True +find_unused_parameters=True + +train_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +val_dataloader=dict( + dataset=dict( + resize_mode='depth-anything', + transform_cfg=dict( + network_process_size=[392, 518]))) + +general_dataloader=dict( + dataset=dict( + network_process_size=(392, 518), + resize_mode='depth-anything')) \ No newline at end of file diff --git a/configs/patchfusion_zoedepth/zoedepth_coarse_pretrain_u4k.py b/configs/patchfusion_zoedepth/zoedepth_coarse_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..a515696b52bc94491c69aab036c8c3737a0282ed --- /dev/null +++ b/configs/patchfusion_zoedepth/zoedepth_coarse_pretrain_u4k.py @@ -0,0 +1,106 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py' +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='ZoeDepth', + + # some important params + midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/ZoeDepthv1.pt', # use torch2 version + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + 
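+    # The DPT_BEiT_L_384 backbone runs on 384x512 inputs (img_size below, a multiple of 32),
+    # unlike the DepthAnything variants above, which process 392x518 (multiples of the ViT
+    # patch size 14).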
force_keep_ar=True, + gpu='NULL', + img_size=[384, 512], + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, + min_depth=min_depth, + max_depth=max_depth, +) + +model=dict( + type='BaselinePretrain', + min_depth=min_depth, + max_depth=max_depth, + target='coarse', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + 'coarse_branch.core': dict(lr_mult=0.1, decay_mult=1.0), + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + diff --git a/configs/patchfusion_zoedepth/zoedepth_fine_pretrain_u4k.py b/configs/patchfusion_zoedepth/zoedepth_fine_pretrain_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..20e8f665bc545dae00c70a4b04fcf9b7652e18f8 --- /dev/null +++ b/configs/patchfusion_zoedepth/zoedepth_fine_pretrain_u4k.py @@ -0,0 +1,107 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py' +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + # some important params + midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/ZoeDepthv1.pt', # use torch2 version + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + img_size=[384, 512], + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + +model=dict( + type='BaselinePretrain', + 
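+    # With target='fine' below, BaselinePretrain instantiates and trains only the patch-wise
+    # fine branch; coarse_branch is still passed to satisfy the shared constructor signature
+    # but is not built in this mode.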
min_depth=min_depth, + max_depth=max_depth, + target='fine', + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + sigloss=dict(type='SILogLoss')) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=24, val_interval=2, save_checkpoint_interval=24, log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + 'fine_branch.core': dict(lr_mult=0.1, decay_mult=1.0), + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=1, + final_div_factor=10000, + pct_start=0.5, + three_phase=False,) + +env_cfg=dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='forkserver'), + dist_cfg=dict(backend='nccl')) + +convert_syncbn=True +find_unused_parameters=True + diff --git a/configs/patchfusion_zoedepth/zoedepth_patchfusion_u4k.py b/configs/patchfusion_zoedepth/zoedepth_patchfusion_u4k.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd1323075121b87573a4376dff72c1792affe96 --- /dev/null +++ b/configs/patchfusion_zoedepth/zoedepth_patchfusion_u4k.py @@ -0,0 +1,111 @@ +_base_ = [ + '../_base_/datasets/u4k.py', + '../_base_/datasets/general_dataset.py', + '../_base_/run_time.py' +] + +min_depth=1e-3 +max_depth=80 + +zoe_depth_config=dict( + type='ZoeDepth', + + min_depth=min_depth, + max_depth=max_depth, + + # some important params + midas_model_type='DPT_BEiT_L_384', + pretrained_resource='local::./work_dir/ZoeDepthv1.pt', + use_pretrained_midas=True, + train_midas=True, + freeze_midas_bn=True, + do_resize=False, # do not resize image in midas + + # default settings + attractor_alpha=1000, + attractor_gamma=2, + attractor_kind='mean', + attractor_type='inv', + aug=True, + bin_centers_type='softplus', + bin_embedding_dim=128, + clip_grad=0.1, + dataset='nyu', + distributed=True, + force_keep_ar=True, + gpu='NULL', + img_size=[384, 512], + inverse_midas=False, + log_images_every=0.1, + max_temp=50.0, + max_translation=100, + memory_efficient=True, + min_temp=0.0212, + model='zoedepth', + n_attractors=[16, 8, 4, 1], + n_bins=64, + name='ZoeDepth', + notes='', + output_distribution='logbinomial', + prefetch=False, + print_losses=False, + project='ZoeDepth', + random_crop=False, + random_translate=False, + root='.', + save_dir='', + shared_dict='NULL', + tags='', + translate_prob=0.2, + uid='NULL', + use_amp=False, + use_shared_dict=False, + validate_every=0.25, + version_name='v1', + workers=16, +) + + +model=dict( + type='PatchFusion', + config=dict( + image_raw_shape=(2160, 3840), + patch_split_num=(4, 4), + patch_process_shape=(384, 512), + min_depth=min_depth, + max_depth=max_depth, + load_branch=True, + pretrain_model=['./work_dir/zoedepth_u4k/coarse_pretrain/checkpoint_24.pth', './work_dir/zoedepth_u4k/fine_pretrain/checkpoint_24.pth'], # coarse, fine + coarse_branch=zoe_depth_config, + fine_branch=zoe_depth_config, + guided_fusion=dict( + type='GuidedFusionPatchFusion', + n_channels=5, + g2l=True,), + sigloss=dict(type='SILogLoss'))) + +collect_input_args=['image_lr', 'crops_image_hr', 'depth_gt', 'crop_depths', 'bboxs', 'image_hr'] + +project='patchfusion' + +train_cfg=dict(max_epochs=16, val_interval=2, save_checkpoint_interval=16, 
log_interval=100, train_log_img_interval=500, val_log_img_interval=50, val_type='epoch_base', eval_start=0) + +optim_wrapper=dict( + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.001), + clip_grad=dict(type='norm', max_norm=0.1, norm_type=2), # norm clip + paramwise_cfg=dict( + bypass_duplicate=True, + custom_keys={ + })) + +param_scheduler=dict( + cycle_momentum=True, + base_momentum=0.85, + max_momentum=0.95, + div_factor=10, + final_div_factor=10000, + pct_start=0.25, + three_phase=False,) + +convert_syncbn=True +find_unused_parameters=True \ No newline at end of file diff --git a/depth_anything/blocks.py b/depth_anything/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..3dc585ea8ca43a3ecca8e82200b0156599694c3b --- /dev/null +++ b/depth_anything/blocks.py @@ -0,0 +1,153 @@ +import torch.nn as nn + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape*2 + out_shape3 = out_shape*4 + if len(in_shape) >= 4: + out_shape4 = out_shape*8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + if self.bn==True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn==True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn==True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None): + """Init. 
+ + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand==True: + out_features = features//2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate( + output, **modifier, mode="bilinear", align_corners=self.align_corners + ) + + output = self.out_conv(output) + + return output \ No newline at end of file diff --git a/depth_anything/dpt.py b/depth_anything/dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..e0b40db892bb791bc3187476fe70605bfc972155 --- /dev/null +++ b/depth_anything/dpt.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn + +from .blocks import FeatureFusionBlock, _make_scratch +import torch.nn.functional as F + + +def _make_fusion_block(features, use_bn, size = None): + return FeatureFusionBlock( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class DPTHead(nn.Module): + def __init__(self, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False): + super(DPTHead, self).__init__() + + self.use_clstoken = use_clstoken + + # out_channels = [in_channels // 8, in_channels // 4, in_channels // 2, in_channels] + # out_channels = [in_channels // 4, in_channels // 2, in_channels, in_channels] + # out_channels = [in_channels, in_channels, in_channels, in_channels] + + self.projects = nn.ModuleList([ + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + stride=1, + padding=0, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + if use_clstoken: + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + nn.Linear(2 * in_channels, in_channels), + nn.GELU())) + + self.scratch = _make_scratch( + out_channels, + features, + groups=1, + expand=False, + ) + + self.scratch.stem_transpose = None + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + head_features_1 = features + head_features_2 = 32 + + self.scratch.output_conv1 = 
nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1) + + self.scratch.output_conv2 = nn.Sequential( + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True), + nn.Identity(), + ) + + def forward(self, out_features, patch_h, patch_w): + out = [] + for i, x in enumerate(out_features): + if self.use_clstoken: + x, cls_token = x[0], x[1] + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + else: + x = x[0] + + x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)) + + x = self.projects[i](x) + x = self.resize_layers[i](x) + + out.append(x) + + layer_1, layer_2, layer_3, layer_4 = out + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv1(path_1) + out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) + out = self.scratch.output_conv2(out) + + return out + + +class DPT_DINOv2(nn.Module): + def __init__(self, encoder='vitl', features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False): + + super(DPT_DINOv2, self).__init__() + + torch.manual_seed(1) + + self.pretrained = torch.hub.load('./torchhub/facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False) + + dim = self.pretrained.blocks[0].attn.qkv.in_features + + self.depth_head = DPTHead(dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken) + + def forward(self, x): + h, w = x.shape[-2:] + + features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True) + + patch_h, patch_w = h // 14, w // 14 + + depth = self.depth_head(features, patch_h, patch_w) + depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True) + depth = F.relu(depth) + + return depth.squeeze(1) \ No newline at end of file diff --git a/ControlNet/annotator/midas/midas/transforms.py b/depth_anything/transform.py similarity index 53% rename from ControlNet/annotator/midas/midas/transforms.py rename to depth_anything/transform.py index 350cbc11662633ad7f8968eb10be2e7de6e384e9..541be67ffb97108a381f5d10ada69d19537c5542 100644 --- a/ControlNet/annotator/midas/midas/transforms.py +++ b/depth_anything/transform.py @@ -1,49 +1,7 @@ -import numpy as np -import cv2 -import math - - -def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): - """Rezise the sample to ensure the given size. Keeps aspect ratio. 
- - Args: - sample (dict): sample - size (tuple): image size - - Returns: - tuple: new size - """ - shape = list(sample["disparity"].shape) - - if shape[0] >= size[0] and shape[1] >= size[1]: - return sample - - scale = [0, 0] - scale[0] = size[0] / shape[0] - scale[1] = size[1] / shape[1] - - scale = max(scale) - - shape[0] = math.ceil(scale * shape[0]) - shape[1] = math.ceil(scale * shape[1]) - - # resize - sample["image"] = cv2.resize( - sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method - ) - - sample["disparity"] = cv2.resize( - sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST - ) - sample["mask"] = cv2.resize( - sample["mask"].astype(np.float32), - tuple(shape[::-1]), - interpolation=cv2.INTER_NEAREST, - ) - sample["mask"] = sample["mask"].astype(bool) - - return tuple(shape) +import torch +import torch.nn as nn +import numpy as np class Resize(object): """Resize sample to given size (width, height). @@ -57,10 +15,8 @@ class Resize(object): keep_aspect_ratio=False, ensure_multiple_of=1, resize_method="lower_bound", - image_interpolation_method=cv2.INTER_AREA, ): """Init. - Args: width (int): desired output width height (int): desired output height @@ -82,23 +38,31 @@ class Resize(object): "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ + # print("Params passed to Resize transform:") + # print("\twidth: ", width) + # print("\theight: ", height) + # print("\tresize_target: ", resize_target) + # print("\tkeep_aspect_ratio: ", keep_aspect_ratio) + # print("\tensure_multiple_of: ", ensure_multiple_of) + # print("\tresize_method: ", resize_method) + self.__width = width self.__height = height - self.__resize_target = resize_target self.__keep_aspect_ratio = keep_aspect_ratio self.__multiple_of = ensure_multiple_of self.__resize_method = resize_method - self.__image_interpolation_method = image_interpolation_method def constrain_to_multiple_of(self, x, min_val=0, max_val=None): y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) if max_val is not None and y > max_val: - y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + y = (np.floor(x / self.__multiple_of) + * self.__multiple_of).astype(int) if y < min_val: - y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + y = (np.ceil(x / self.__multiple_of) + * self.__multiple_of).astype(int) return y @@ -155,80 +119,11 @@ class Resize(object): new_height = self.constrain_to_multiple_of(scale_height * height) new_width = self.constrain_to_multiple_of(scale_width * width) else: - raise ValueError(f"resize_method {self.__resize_method} not implemented") + raise ValueError( + f"resize_method {self.__resize_method} not implemented") return (new_width, new_height) - def __call__(self, sample): - width, height = self.get_size( - sample["image"].shape[1], sample["image"].shape[0] - ) - - # resize sample - sample["image"] = cv2.resize( - sample["image"], - (width, height), - interpolation=self.__image_interpolation_method, - ) - - if self.__resize_target: - if "disparity" in sample: - sample["disparity"] = cv2.resize( - sample["disparity"], - (width, height), - interpolation=cv2.INTER_NEAREST, - ) - - if "depth" in sample: - sample["depth"] = cv2.resize( - sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST - ) - - sample["mask"] = cv2.resize( - sample["mask"].astype(np.float32), - (width, height), - interpolation=cv2.INTER_NEAREST, - ) - sample["mask"] = 
sample["mask"].astype(bool) - - return sample - - -class NormalizeImage(object): - """Normlize image by given mean and std. - """ - - def __init__(self, mean, std): - self.__mean = mean - self.__std = std - - def __call__(self, sample): - sample["image"] = (sample["image"] - self.__mean) / self.__std - - return sample - - -class PrepareForNet(object): - """Prepare sample for usage as network input. - """ - - def __init__(self): - pass - - def __call__(self, sample): - image = np.transpose(sample["image"], (2, 0, 1)) - sample["image"] = np.ascontiguousarray(image).astype(np.float32) - - if "mask" in sample: - sample["mask"] = sample["mask"].astype(np.float32) - sample["mask"] = np.ascontiguousarray(sample["mask"]) - - if "disparity" in sample: - disparity = sample["disparity"].astype(np.float32) - sample["disparity"] = np.ascontiguousarray(disparity) - - if "depth" in sample: - depth = sample["depth"].astype(np.float32) - sample["depth"] = np.ascontiguousarray(depth) - - return sample + def __call__(self, x): + width, height = self.get_size(*x.shape[-2:][::-1]) + return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True) diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..3113733efb294628c9baa20a69833e987017f061 --- /dev/null +++ b/environment.yml @@ -0,0 +1,38 @@ +name: patchfusion +channels: + - pytorch + - nvidia + - anaconda + - conda-forge +dependencies: + - cuda=11.8 + - h5py=3.9.0 + - hdf5=1.12.1 + - matplotlib=3.7.3 + - matplotlib-base=3.7.3 + - numpy=1.24.4 + # - opencv=4.6.0 + - pip=23.3.2 + - python=3.10 + - pytorch=2.1.2 + - pytorch-cuda=11.8 + - pytorch-mutex=1.0 + - torchaudio=2.1.2 + - torchvision=0.16.2 + - json5=0.9.14 + - einops=0.7.0 + - scikit-image=0.20.0 + - pip: + - huggingface-hub==0.20.1 + - timm==0.9.2 + - tqdm==4.66.1 + - wandb==0.16.1 + - opencv-python==4.8.1.78 + - imageio==2.33.1 + - mmengine==0.10.2 + - scipy==1.10.1 + - prettytable==3.10.0 + - kornia==0.7.2 + - transformers==4.36.2 + - torchmetrics==1.3.1 + - pandas==2.0.3 \ No newline at end of file diff --git a/estimator/__init__.py b/estimator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..319c9e41693d7c9f749c30a1a75a922ccf9de11b --- /dev/null +++ b/estimator/__init__.py @@ -0,0 +1,6 @@ +from .datasets import * +from .models import * +from .registry import * +from .trainer import * +from .utils import * +from .tester import * \ No newline at end of file diff --git a/estimator/models/__init__.py b/estimator/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..079ddd70e6d01210e51df124782bc01d0b7baf63 --- /dev/null +++ b/estimator/models/__init__.py @@ -0,0 +1,5 @@ +from .builder import build_model +from .baseline_pretrain import BaselinePretrain +from .losses import SILogLoss, ErrorLoss, EdgeguidedRankingLoss, ExistLoss, EdgeClsLoss +from .blocks import * +from .patchfusion import PatchFusion \ No newline at end of file diff --git a/estimator/models/baseline_pretrain.py b/estimator/models/baseline_pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..0c4ba22c8ece44f18f10b59fadaada6edeb791e8 --- /dev/null +++ b/estimator/models/baseline_pretrain.py @@ -0,0 +1,417 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software 
without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Zhenyu Li + +import itertools + +import math +import copy +import random +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from mmengine import print_log + +from estimator.registry import MODELS +from estimator.models import build_model +from estimator.models.utils import get_activation +from zoedepth.models.zoedepth import ZoeDepth + +import matplotlib.pyplot as plt +from estimator.models.utils import get_activation, generatemask, RunningAverageMap +from zoedepth.models.base_models.midas import Resize as ResizeZoe +from depth_anything.transform import Resize as ResizeDA + +@MODELS.register_module() +class BaselinePretrain(nn.Module): + def __init__(self, + coarse_branch, + fine_branch, + sigloss, + min_depth, + max_depth, + image_raw_shape=(2160, 3840), + patch_process_shape=(384, 512), + patch_split_num=(4, 4), + target='coarse', + coarse_branch_zoe=None): + """ZoeDepth model + """ + super().__init__() + + self.patch_process_shape = patch_process_shape + self.tile_cfg = self.prepare_tile_cfg(image_raw_shape, patch_split_num) + + self.min_depth = min_depth + self.max_depth = max_depth + self.coarse_branch_cfg = coarse_branch + self.fine_branch_cfg = fine_branch + if target == 'coarse': + if self.coarse_branch_cfg.type == 'ZoeDepth': + self.coarse_branch = ZoeDepth.build(**coarse_branch) + print_log("Current zoedepth.core.prep.resizer is {}".format(type(self.coarse_branch.core.prep.resizer)), logger='current') + self.resizer = ResizeZoe(patch_process_shape[1], patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal") + elif self.coarse_branch_cfg.type == 'DA-ZoeDepth': + self.coarse_branch = ZoeDepth.build(**coarse_branch) + print_log("Current zoedepth.core.prep.resizer is {}".format(type(self.coarse_branch.core.prep.resizer)), logger='current') + self.resizer = ResizeDA(patch_process_shape[1], patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=14, resize_method="minimal") + + if target == 'fine': + if self.fine_branch_cfg.type == 'ZoeDepth': + self.fine_branch = ZoeDepth.build(**fine_branch) + print_log("Current zoedepth.core.prep.resizer is {}".format(type(self.fine_branch.core.prep.resizer)), logger='current') + self.resizer = ResizeZoe(patch_process_shape[1], patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal") + elif self.fine_branch_cfg.type == 'DA-ZoeDepth': + self.fine_branch = ZoeDepth.build(**fine_branch) + print_log("Current zoedepth.core.prep.resizer is {}".format(type(self.fine_branch.core.prep.resizer)), logger='current') + self.resizer = 
ResizeDA(patch_process_shape[1], patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=14, resize_method="minimal") + + self.sigloss = build_model(sigloss) + self.target = target + + + def prepare_tile_cfg(self, image_raw_shape, patch_split_num): + # information for process + patch_split_num = patch_split_num + patch_reensemble_shape = (self.patch_process_shape[0] * patch_split_num[0], self.patch_process_shape[1] * patch_split_num[1]) + patch_raw_shape = (image_raw_shape[0] // patch_split_num[0], image_raw_shape[1] // patch_split_num[1]) + image_raw_shape = image_raw_shape + + raw_h_split_point = [] + raw_w_split_point = [] + for i in range(patch_split_num[0]): + raw_h_split_point.append(int(patch_raw_shape[0] * i)) + for i in range(patch_split_num[1]): + raw_w_split_point.append(int(patch_raw_shape[1] * i)) + + tile_cfg = { + 'patch_split_num': patch_split_num, + 'patch_reensemble_shape': patch_reensemble_shape, + 'patch_raw_shape': patch_raw_shape, + 'image_raw_shape': image_raw_shape, + 'raw_h_split_point': raw_h_split_point, + 'raw_w_split_point': raw_w_split_point} + + return tile_cfg + + def load_dict(self, dict): + if hasattr(self, 'coarse_branch') and hasattr(self, 'fine_branch') == False: + return self.coarse_branch.load_state_dict(dict, strict=True) + elif hasattr(self, 'fine_branch') and hasattr(self, 'coarse_branch') == False: + return self.fine_branch.load_state_dict(dict, strict=True) + else: + raise NotImplementedError('Not support loading coarse and fine together') + + def get_save_dict(self): + model_state_dict = {} + if hasattr(self, 'coarse_branch') and hasattr(self, 'fine_branch') == False: + model_state_dict.update(self.coarse_branch.state_dict()) + elif hasattr(self, 'fine_branch') and hasattr(self, 'coarse_branch') == False: + model_state_dict.update(self.fine_branch.state_dict()) + else: + raise NotImplementedError('Not support training coarse and fine together') + return model_state_dict + + def infer_forward(self, imgs_crop): + output_dict = self.fine_branch(imgs_crop) + return output_dict['metric_depth'] + + @torch.no_grad() + def random_tile( + self, + image_hr, + tile_temp=None, + blur_mask=None, + avg_depth_map=None, + tile_cfg=None, + process_num=4,): + ## setting + height, width = tile_cfg['patch_raw_shape'][0], tile_cfg['patch_raw_shape'][1] + + h_start_list = [random.randint(0, tile_cfg['image_raw_shape'][0] - height - 1) for _ in range(process_num)] + w_start_list = [random.randint(0, tile_cfg['image_raw_shape'][1] - width - 1)] + + ## prepare data + imgs_crop = [] + bboxs = [] + + for h_start in h_start_list: + for w_start in w_start_list: + crop_image = image_hr[:, h_start: h_start+height, w_start: w_start+width] + crop_image_resized = self.resizer(crop_image.unsqueeze(dim=0)).squeeze(dim=0) # resize to patch_process_shape + bbox = torch.tensor([w_start, h_start, w_start+width, h_start+height]) + imgs_crop.append(crop_image_resized) + bboxs.append(bbox) + + imgs_crop = torch.stack(imgs_crop, dim=0) + bboxs = torch.stack(bboxs, dim=0) + + imgs_crop = imgs_crop.to(image_hr.device) + bboxs = bboxs.to(image_hr.device).int() + bboxs_feat_factor = torch.tensor([ + 1 / tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0], + 1 / tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0]], device=bboxs.device).unsqueeze(dim=0) + bboxs_feat = bboxs * bboxs_feat_factor + inds = 
torch.arange(bboxs.shape[0]).to(bboxs.device).unsqueeze(dim=-1) + bboxs_feat = torch.cat((inds, bboxs_feat), dim=-1) + + if tile_temp is not None: + coarse_postprocess_dict = self.coarse_postprocess_test(bboxs=bboxs, bboxs_feat=bboxs_feat, **tile_temp) + + prediction_list = [] + if tile_temp is not None: + coarse_temp_dict = {} + for k, v in coarse_postprocess_dict.items(): + if k == 'coarse_feats_roi': + coarse_temp_dict[k] = [f for f in v] + else: + coarse_temp_dict[k] = v + bbox_feat_forward = bboxs_feat + bbox_feat_forward[:, 0] = 0 + prediction = self.infer_forward(imgs_crop, bbox_feat_forward, tile_temp, coarse_temp_dict) + else: + prediction = self.infer_forward(imgs_crop) + + prediction_list.append(prediction) + predictions = torch.cat(prediction_list, dim=0) + predictions = F.interpolate(predictions, tile_cfg['patch_raw_shape']) + + patch_select_idx = 0 + for h_start in h_start_list: + for w_start in w_start_list: + temp_depth = predictions[patch_select_idx] + + count_map = torch.zeros(tile_cfg['image_raw_shape'], device=temp_depth.device) + pred_depth = torch.zeros(tile_cfg['image_raw_shape'], device=temp_depth.device) + count_map[h_start: h_start+tile_cfg['patch_raw_shape'][0], w_start: w_start+tile_cfg['patch_raw_shape'][1]] = blur_mask + pred_depth[h_start: h_start+tile_cfg['patch_raw_shape'][0], w_start: w_start+tile_cfg['patch_raw_shape'][1]] = temp_depth * blur_mask + avg_depth_map.update(pred_depth, count_map) + + patch_select_idx += 1 + + return avg_depth_map + + + @torch.no_grad() + def regular_tile( + self, + offset, + offset_process, + image_hr, + init_flag=False, + tile_temp=None, + blur_mask=None, + avg_depth_map=None, + tile_cfg=None, + process_num=4,): + + ## setting + height, width = tile_cfg['patch_raw_shape'][0], tile_cfg['patch_raw_shape'][1] + offset_h, offset_w = offset[0], offset[1] + assert offset_w >= 0 and offset_h >= 0 + + tile_num_h = (tile_cfg['image_raw_shape'][0] - offset_h) // height + tile_num_w = (tile_cfg['image_raw_shape'][1] - offset_w) // width + h_start_list = [height * h + offset_h for h in range(tile_num_h)] + w_start_list = [width * w + offset_w for w in range(tile_num_w)] + + height_process, width_process = self.patch_process_shape[0], self.patch_process_shape[1] + offset_h_process, offset_w_process = offset_process[0], offset_process[1] + assert offset_h_process >= 0 and offset_w_process >= 0 + + tile_num_h_process = (tile_cfg['patch_reensemble_shape'][0] - offset_h_process) // height_process + tile_num_w_process = (tile_cfg['patch_reensemble_shape'][1] - offset_w_process) // width_process + h_start_list_process = [height_process * h + offset_h_process for h in range(tile_num_h_process)] + w_start_list_process = [width_process * w + offset_w_process for w in range(tile_num_w_process)] + + ## prepare data + imgs_crop = [] + bboxs = [] + + iter_priors = [] + for h_start in h_start_list: + for w_start in w_start_list: + crop_image = image_hr[:, h_start: h_start+height, w_start: w_start+width] + crop_image_resized = self.resizer(crop_image.unsqueeze(dim=0)).squeeze(dim=0) # resize to patch_process_shape + bbox = torch.tensor([w_start, h_start, w_start+width, h_start+height]) + imgs_crop.append(crop_image_resized) + bboxs.append(bbox) + + imgs_crop = torch.stack(imgs_crop, dim=0) + bboxs = torch.stack(bboxs, dim=0) + + imgs_crop = imgs_crop.to(image_hr.device) + bboxs = bboxs.to(image_hr.device).int() + + bboxs = bboxs.squeeze() # HACK: during inference, 1, 16, 4 -> 16, 4 + if len(bboxs.shape) == 1: + bboxs = bboxs.unsqueeze(dim=0) + 
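+        # Rescale the pixel-space crop boxes (x1, y1, x2, y2) from image_raw_shape to the
+        # patch_process_shape frame and prepend a per-box batch index, giving the
+        # (index, x1, y1, x2, y2) layout expected by roi_align in the guided fusion module.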
bboxs_feat_factor = torch.tensor([ + 1 / tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0], + 1 / tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0]], device=bboxs.device).unsqueeze(dim=0) + bboxs_feat = bboxs * bboxs_feat_factor + inds = torch.arange(bboxs.shape[0]).to(bboxs.device).unsqueeze(dim=-1) + bboxs_feat = torch.cat((inds, bboxs_feat), dim=-1) + + # post_process + if tile_temp is not None: + # coarse_prediction_roi, coarse_features_patch_area, crop_coarse_prediction_collection = self.coarse_postprocess_test(bboxs=bboxs, bboxs_feat=bboxs_feat, **tile_temp) + coarse_postprocess_dict = self.coarse_postprocess_test(bboxs=bboxs, bboxs_feat=bboxs_feat, **tile_temp) + + count_map = torch.zeros(tile_cfg['patch_reensemble_shape'], device=image_hr.device) + pred_depth = torch.zeros(tile_cfg['patch_reensemble_shape'], device=image_hr.device) + + prediction_list = [] + split_rebatch_image = torch.split(imgs_crop, process_num, dim=0) + for idx, rebatch_image in enumerate(split_rebatch_image): + if tile_temp is not None: + coarse_temp_dict = {} + for k, v in coarse_postprocess_dict.items(): + if k == 'coarse_feats_roi': + coarse_temp_dict[k] = [f[idx*process_num:(idx+1)*process_num, :, :, :] for f in v] + else: + coarse_temp_dict[k] = v[idx*process_num:(idx+1)*process_num, :, :, :] + bbox_feat_forward = bboxs_feat[idx*process_num:(idx+1)*process_num, :] + bbox_feat_forward[:, 0] = 0 + prediction = self.infer_forward(rebatch_image, bbox_feat_forward, tile_temp, coarse_temp_dict) + else: + prediction = self.infer_forward(rebatch_image) + prediction_list.append(prediction) + predictions = torch.cat(prediction_list, dim=0) + + patch_select_idx = 0 + for h_start in h_start_list_process: + for w_start in w_start_list_process: + temp_depth = predictions[patch_select_idx] + + if init_flag: + count_map[h_start: h_start+self.patch_process_shape[0], w_start: w_start+self.patch_process_shape[1]] = blur_mask + pred_depth[h_start: h_start+self.patch_process_shape[0], w_start: w_start+self.patch_process_shape[1]] = temp_depth * blur_mask + + else: + count_map = torch.zeros(tile_cfg['patch_reensemble_shape'], device=temp_depth.device) + pred_depth = torch.zeros(tile_cfg['patch_reensemble_shape'], device=temp_depth.device) + count_map[h_start: h_start+self.patch_process_shape[0], w_start: w_start+self.patch_process_shape[1]] = blur_mask + pred_depth[h_start: h_start+self.patch_process_shape[0], w_start: w_start+self.patch_process_shape[1]] = temp_depth * blur_mask + avg_depth_map.update(pred_depth, count_map) + + patch_select_idx += 1 + + if init_flag: + avg_depth_map = RunningAverageMap(pred_depth, count_map) + + return avg_depth_map + + def forward( + self, + mode, + image_lr, + image_hr, + depth_gt, + crop_depths=None, + crops_image_hr=None, + bboxs=None, + tile_cfg=None, + cai_mode='m1', + process_num=4, + **kwargs): + + if mode == 'train': + loss_dict = {} + if self.target == 'coarse': + model_output_dict = self.coarse_branch(image_lr) + depth_prediction = model_output_dict['metric_depth'] + loss_dict['coarse_loss'] = self.sigloss(depth_prediction, depth_gt, self.min_depth, self.max_depth) + loss_dict['total_loss'] = loss_dict['coarse_loss'] + return loss_dict, {'rgb': image_lr, 'depth_pred': depth_prediction, 'depth_gt': depth_gt} + elif self.target == 'fine': + model_output_dict = self.fine_branch(crops_image_hr) # 1/2 res, 1/4 res, 1/8 res, 1/16 
res + depth_prediction = model_output_dict['metric_depth'] + loss_dict['fine_loss'] = self.sigloss(depth_prediction, crop_depths, self.min_depth, self.max_depth) + loss_dict['total_loss'] = loss_dict['fine_loss'] + return loss_dict, {'rgb': image_lr, 'depth_pred': depth_prediction, 'depth_gt': crop_depths} + else: + raise NotImplementedError + + else: + if self.target == 'coarse': + model_output_dict = self.coarse_branch(image_lr) + depth_prediction = model_output_dict['metric_depth'] + + elif self.target == 'fine': + if tile_cfg is None: + tile_cfg = self.tile_cfg + else: + tile_cfg = self.prepare_tile_cfg(tile_cfg['image_raw_shape'], tile_cfg['patch_split_num']) + + assert image_hr.shape[0] == 1 + + blur_mask = generatemask((self.patch_process_shape[0], self.patch_process_shape[1])) + 1e-3 + blur_mask = torch.tensor(blur_mask, device=image_hr.device) + avg_depth_map = self.regular_tile( + offset=[0, 0], + offset_process=[0, 0], + image_hr=image_hr[0], + init_flag=True, + tile_temp=None, + blur_mask=blur_mask, + tile_cfg=tile_cfg, + process_num=process_num) + + if cai_mode == 'm2' or cai_mode[0] == 'r': + avg_depth_map = self.regular_tile( + offset=[0, tile_cfg['patch_raw_shape'][1]//2], + offset_process=[0, self.patch_process_shape[1]//2], + image_hr=image_hr[0], init_flag=False, tile_temp=None, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + avg_depth_map = self.regular_tile( + offset=[tile_cfg['patch_raw_shape'][0]//2, 0], + offset_process=[self.patch_process_shape[0]//2, 0], + image_hr=image_hr[0], init_flag=False, tile_temp=None, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + avg_depth_map = self.regular_tile( + offset=[tile_cfg['patch_raw_shape'][0]//2, tile_cfg['patch_raw_shape'][1]//2], + offset_process=[self.patch_process_shape[0]//2, self.patch_process_shape[1]//2], + init_flag=False, image_hr=image_hr[0], tile_temp=None, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + + if cai_mode[0] == 'r': + blur_mask = generatemask((tile_cfg['patch_raw_shape'][0], tile_cfg['patch_raw_shape'][1])) + 1e-3 + blur_mask = torch.tensor(blur_mask, device=image_hr.device) + avg_depth_map.resize(tile_cfg['image_raw_shape']) + patch_num = int(cai_mode[1:]) + for i in range(patch_num): + avg_depth_map = self.random_tile( + image_hr=image_hr[0], tile_temp=None, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + + depth = avg_depth_map.average_map + depth = depth.unsqueeze(dim=0).unsqueeze(dim=0) + return depth, {} + + else: + raise NotImplementedError + + return depth_prediction, {'rgb': image_lr, 'depth_pred': depth_prediction, 'depth_gt': depth_gt} + diff --git a/estimator/models/blocks/__init__.py b/estimator/models/blocks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b5b12ed492a0e4064c0116a888a593234fc38824 --- /dev/null +++ b/estimator/models/blocks/__init__.py @@ -0,0 +1,2 @@ +from .guided_fusion_model import GuidedFusionPatchFusion +from .swin_layers import G2LFusion \ No newline at end of file diff --git a/estimator/models/blocks/guided_fusion_model.py b/estimator/models/blocks/guided_fusion_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9ef46917a30ab93f4a108083b171f7d30ce1781e --- /dev/null +++ b/estimator/models/blocks/guided_fusion_model.py @@ -0,0 +1,207 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission 
is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Zhenyu Li + + +import torch +import torch.nn as nn +import torch.nn.functional as F +# from zoedepth.models.layers.swin_layers import G2LFusion +from estimator.models.blocks.swin_layers import G2LFusion +from torchvision.ops import roi_align as torch_roi_align +from estimator.registry import MODELS + +class DoubleConvWOBN(nn.Module): + """(convolution => [BN] => ReLU) * 2""" + + def __init__(self, in_channels, out_channels, mid_channels=None): + super().__init__() + if not mid_channels: + mid_channels = out_channels + self.double_conv = nn.Sequential( + nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=True), + # nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True), + nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=True), + # nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True)) + + def forward(self, x): + return self.double_conv(x) + +class DoubleConv(nn.Module): + """(convolution => [BN] => ReLU) * 2""" + + def __init__(self, in_channels, out_channels, mid_channels=None): + super().__init__() + if not mid_channels: + mid_channels = out_channels + self.double_conv = nn.Sequential( + nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True), + nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True) + ) + + def forward(self, x): + return self.double_conv(x) + + +class Down(nn.Module): + """Downscaling with maxpool then double conv""" + + def __init__(self, in_channels, out_channels): + super().__init__() + self.maxpool_conv = nn.Sequential( + nn.MaxPool2d(2), + DoubleConv(in_channels, out_channels) + ) + + def forward(self, x): + return self.maxpool_conv(x) + +class Upv1(nn.Module): + """Upscaling then double conv""" + + def __init__(self, in_channels, out_channels, mid_channels=None): + super().__init__() + # self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + if mid_channels is not None: + self.conv = DoubleConvWOBN(in_channels, out_channels, mid_channels) + else: + self.conv = DoubleConvWOBN(in_channels, out_channels, in_channels) + + def forward(self, x1, x2): + # x1 = self.up(x1) + x1 = F.interpolate(x1, size=x2.shape[-2:], mode='bilinear', align_corners=True) + x = torch.cat([x2, x1], dim=1) + return self.conv(x) + +@MODELS.register_module() +class GuidedFusionPatchFusion(nn.Module): + def __init__( + self, + 
n_channels, + g2l, + in_channels=[32, 256, 256, 256, 256, 256], + depth=[2, 2, 3, 3, 4, 4], + num_heads=[8, 8, 16, 16, 32, 32], + # num_patches=[12*16, 24*32, 48*64, 96*128, 192*256, 384*512], + num_patches=[384*512, 192*256, 96*128, 48*64, 24*32, 12*16], + patch_process_shape=[384, 512]): + + super(GuidedFusionPatchFusion, self).__init__() + self.n_channels = n_channels + + self.inc = DoubleConv(n_channels, in_channels[0]) + + self.down_conv_list = nn.ModuleList() + for idx in range(len(in_channels) - 1): + lay = Down(in_channels[idx], in_channels[idx+1]) + self.down_conv_list.append(lay) + + + in_channels_inv = in_channels[::-1] + self.up_conv_list = nn.ModuleList() + for idx in range(1, len(in_channels)): + lay = Upv1(in_channels_inv[idx] + in_channels_inv[idx-1] + in_channels_inv[idx-1], in_channels_inv[idx]) + self.up_conv_list.append(lay) + + self.g2l = g2l + if self.g2l: + self.g2l_att = nn.ModuleList() + + win = 12 + self.patch_process_shape = patch_process_shape + num_heads_inv = num_heads[::-1] + depth_inv = depth[::-1] + num_patches_inv = num_patches[::-1] + self.g2l_list = nn.ModuleList() + self.convs = nn.ModuleList() + + for idx in range(len(in_channels_inv)): + g2l_layer = G2LFusion(input_dim=in_channels_inv[idx], embed_dim=in_channels_inv[idx], window_size=win, num_heads=num_heads_inv[idx], depth=depth_inv[idx], num_patches=num_patches_inv[idx]) + self.g2l_list.append(g2l_layer) + layer = DoubleConvWOBN(in_channels_inv[idx] * 2, in_channels_inv[idx], in_channels_inv[idx]) + self.convs.append(layer) + + # self.g2l5 = G2LFusion(input_dim=in_channels[5], embed_dim=crf_dims[5], window_size=win, num_heads=32, depth=4, num_patches=num_patches[0]) + # self.g2l4 = G2LFusion(input_dim=in_channels[4], embed_dim=crf_dims[4], window_size=win, num_heads=32, depth=4, num_patches=num_patches[1]) + # self.g2l3 = G2LFusion(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, num_heads=16, depth=3, num_patches=num_patches[2]) + # self.g2l2 = G2LFusion(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, num_heads=16, depth=3, num_patches=num_patches[3]) + # self.g2l1 = G2LFusion(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, num_heads=8, depth=2, num_patches=num_patches[4]) + # self.g2l0 = G2LFusion(input_dim=in_channels[0], embed_dim=crf_dims[0], window_size=win, num_heads=8, depth=2, num_patches=num_patches[5]) + # self.conv5 = DoubleConvWOBN(in_channels[5] * 2, in_channels[5], in_channels[5]) + # self.conv4 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4]) + # self.conv3 = DoubleConvWOBN(in_channels[3] * 2, in_channels[3], in_channels[3]) + # self.conv2 = DoubleConvWOBN(in_channels[2] * 2, in_channels[2], in_channels[2]) + # self.conv1 = DoubleConvWOBN(in_channels[1] * 2, in_channels[1], in_channels[1]) + # self.conv0 = DoubleConvWOBN(in_channels[0] * 2, in_channels[0], in_channels[0]) + + def forward(self, + input_tensor, + guide_plus, + guide_cat, + bbox=None, + fine_feat_crop=None, + coarse_feat_whole=None, + coarse_feat_whole_hack=None, + coarse_feat_crop=None): + + # apply unscaled feat to swin + if coarse_feat_whole_hack is not None: + coarse_feat_whole = coarse_feat_whole_hack + + feat_list = [] + + x = self.inc(input_tensor) + feat_list.append(x) + + for layer in self.down_conv_list: + x = layer(x) + feat_list.append(x) + + output = [] + feat_inv_list = feat_list[::-1] + for idx, (feat_enc, feat_c) in enumerate(zip(feat_inv_list, coarse_feat_whole)): + + # in case for depth-anything + _, _, h, w = feat_enc.shape + if 
h != feat_c.shape[-2] or w != feat_c.shape[-1]: + feat_enc = F.interpolate(feat_enc, size=feat_c.shape[-2:], mode='bilinear', align_corners=True) + + if idx == 0: + pass + else: + feat_enc = self.up_conv_list[idx-1](torch.cat([temp_feat, guide_cat[idx-1]], dim=1), feat_enc) + + _, _, h, w = feat_c.shape + feat_c = self.g2l_list[idx](feat_c, None) + feat_c = torch_roi_align(feat_c, bbox, (h, w), h/self.patch_process_shape[0], aligned=True) + x = self.convs[idx](torch.cat([feat_enc, feat_c], dim=1)) + temp_feat = x + output.append(x) + + return output[::-1] \ No newline at end of file diff --git a/estimator/models/blocks/swin_layers.py b/estimator/models/blocks/swin_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..b3ef146f9365bc941e75c917d988f865552c89ea --- /dev/null +++ b/estimator/models/blocks/swin_layers.py @@ -0,0 +1,432 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
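Note on the decoder above: it pulls the patch-aligned region out of each whole-image coarse feature map with torchvision's roi_align before fusing it with the crop features. A minimal sketch of that cropping step, with assumed shapes and box coordinates (not taken from the repo's configs):

import torch
from torchvision.ops import roi_align

# Whole-image coarse feature at 1/16 of an assumed 384x512 process shape.
feat_whole = torch.randn(1, 256, 24, 32)

# One box per patch: (batch_index, x1, y1, x2, y2), given in 384x512 coordinates.
boxes = torch.tensor([[0.0, 128.0, 96.0, 384.0, 288.0]])

# spatial_scale maps box coordinates to the feature resolution, playing the role
# of h / patch_process_shape[0] in the forward above.
feat_crop = roi_align(feat_whole, boxes, output_size=(24, 32), spatial_scale=24 / 384, aligned=True)
print(feat_crop.shape)  # torch.Size([1, 256, 24, 32])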
+ +# File author: Zhenyu Li + +# This file mainly copy-and-paste from Swin Transformer (https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py); author: Ze Liu + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import torch.nn.functional as F +import numpy as np + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + fused_window_process (bool, optional): If True, use one kernel to fused window shift & window partition for acceleration, similar for the reversed part. Default: False + """ + + def __init__(self, dim, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, + fused_window_process=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.fused_window_process = fused_window_process + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + H, W = self.H, self.W + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + x = shortcut + self.drop_path(x) + + # FFN + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class G2LBasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + fused_window_process (bool, optional): If True, use one kernel to fused window shift & window partition for acceleration, similar for the reversed part. Default: False + """ + + def __init__(self, dim, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, + fused_window_process=False): + + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.dim = dim + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + fused_window_process=fused_window_process) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + + + def forward(self, x, H, W, area_prior=None): + + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x = self.downsample(x) + return x, H, W + + + + +class G2LFusion(nn.Module): + def __init__(self, + input_dim=96, + embed_dim=96, + window_size=7, + num_heads=4, + depth=2, + # patch_size=4, + # in_chans=3, + norm_layer=nn.LayerNorm, + patch_norm=True, + ape=True, + num_patches=0): + super().__init__() + + self.embed_dim = embed_dim + self.patch_norm = patch_norm + + if input_dim != embed_dim: + self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1) + else: + self.proj_x = None + + self.g2l_layer = G2LBasicLayer( + dim=embed_dim, + depth=depth, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + 
attn_drop=0., + drop_path=0., + norm_layer=norm_layer, + downsample=None, + use_checkpoint=False) + + layer = norm_layer(embed_dim) + layer_name = 'g2l_layer_norm' + self.add_module(layer_name, layer) + + # self.embed_proj = nn.Linear(1, dim) # learnable + self.embed_proj = nn.Conv2d(1, embed_dim, 1, 1, 0) # learnable + + self.ape = ape + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim), requires_grad=True) + trunc_normal_(self.absolute_pos_embed, std=.02) + + def forward(self, x, area_prior=None): + if self.proj_x is not None: + x = self.proj_x(x) + + if area_prior is not None: + prior_embed = self.embed_proj(area_prior) + x = x + prior_embed + + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + + if self.ape: + x = x + self.absolute_pos_embed # this line is later added + + if area_prior is not None: + area_prior = area_prior.flatten(2).transpose(1, 2) + + x_out, H, W = self.g2l_layer(x, Wh, Ww, area_prior) + norm_layer = getattr(self, f'g2l_layer_norm') + x_out = norm_layer(x_out) + out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, 2).contiguous() + + return out \ No newline at end of file diff --git a/estimator/models/builder.py b/estimator/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f327927af521f448273ff37baa71ac3c0b559351 --- /dev/null +++ b/estimator/models/builder.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from estimator.registry import MODELS + +def build_model(cfg): + """Build backbone.""" + return MODELS.build(cfg) \ No newline at end of file diff --git a/estimator/models/losses.py b/estimator/models/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..51aa3b7acb21963eee2159d2aca46dd995016249 --- /dev/null +++ b/estimator/models/losses.py @@ -0,0 +1,613 @@ +import copy +import kornia + +import torch +import torch.nn as nn +from mmengine import print_log +import torch.nn.functional as F +import random +import math + +from estimator.registry import MODELS +from kornia.losses import dice_loss, focal_loss + +@MODELS.register_module() +class SILogLoss(nn.Module): + """SILog loss (pixel-wise)""" + def __init__(self, beta=0.15, **kwargs): + super(SILogLoss, self).__init__() + self.name = 'SILog' + self.beta = beta + + def forward(self, input, target, min_depth, max_depth, additional_mask=None): + _, _, h_i, w_i = input.shape + _, _, h_t, w_t = target.shape + + if h_i != h_t or w_i != w_t: + input = F.interpolate(input, (h_t, w_t), mode='bilinear', align_corners=True) + + mask = torch.logical_and(target>min_depth, target<max_depth) + + if additional_mask is not None: + mask_merge = torch.logical_and(mask, additional_mask) + if torch.sum(mask_merge) >= h_i * w_i * 0.001: + mask = mask_merge + else: + print_log("torch.sum(mask_merge) < h_i * w_i * 0.001, reduce to previous mask for stable training", logger='current') + + if torch.sum(mask) <= 1: + print_log("torch.sum(mask) <= 1, hack to skip avoiding nan", logger='current') + return input * 0.0 + + input = input[mask] + target = target[mask] + alpha = 1e-7 + g = torch.log(input + alpha) - torch.log(target + alpha) + Dg = torch.var(g) + self.beta * torch.pow(torch.mean(g), 2) + loss = 10 * torch.sqrt(Dg) + + if torch.isnan(loss): + print_log("Nan SILog loss", logger='current') + print_log("input: {}".format(input.shape), logger='current') + print_log("target: {}".format(target.shape), logger='current') + + print_log("G: {}".format(torch.sum(torch.isnan(g))), logger='current') + print_log("Input min: {} max: {}".format(torch.min(input), torch.max(input)), logger='current') +
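For reference, the quantity SILogLoss computes over the valid pixels is the scale-invariant log error; a compact, self-contained restatement on toy tensors (not the training pipeline):

import torch

def silog(pred, gt, beta=0.15, eps=1e-7):
    # g = log(pred) - log(gt) over valid pixels;
    # loss = 10 * sqrt(var(g) + beta * mean(g)^2), as in SILogLoss above.
    g = torch.log(pred + eps) - torch.log(gt + eps)
    return 10 * torch.sqrt(torch.var(g) + beta * torch.mean(g) ** 2)

pred = torch.rand(1000) * 10 + 0.1   # toy flattened valid-pixel predictions
gt = pred * 1.05                     # ground truth off by a constant factor
print(silog(pred, gt))               # ~0.19: var(g) is 0 for a constant ratio, only the beta term remains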
print_log("Target min: {} max: {}".format(torch.min(target), torch.max(target)), logger='current') + print_log("Dg: {}".format(torch.isnan(Dg)), logger='current') + print_log("loss: {}".format(torch.isnan(loss)), logger='current') + + return loss + + +def get_grad_map(value): + grad = kornia.filters.spatial_gradient(value) + grad_xy = (grad[:,:,0,:,:] ** 2 + grad[:,:,1,:,:] ** 2) ** (1/2) + return grad_xy + +def get_grad_error_mask(gt, coarse_prediction, shape=(384, 512), min_depth=1e-3, max_depth=80): + invalid_mask = torch.logical_or(gt<=min_depth, gt>=max_depth) + gt_grad = get_grad_map(gt) + coarse_prediction_grad = get_grad_map(coarse_prediction) + grad_error = ((gt_grad - coarse_prediction_grad) / gt).abs() + grad_error[grad_error>0.001] = 1. + grad_error[invalid_mask] = 2. # filter invalid area + grad_error[gt>10000] = 3. + return grad_error.long().squeeze(dim=1) + +def get_grad_value_error_mask(gt, coarse_prediction, shape=(384, 512), min_depth=1e-3, max_depth=80): + invalid_mask = torch.logical_or(gt<=min_depth, gt>=max_depth) + error = ((gt - coarse_prediction) / gt).abs() + error[error>0.1] = 1. + gt_grad = get_grad_map(gt) + coarse_prediction_grad = get_grad_map(coarse_prediction) + grad_error = ((gt_grad - coarse_prediction_grad) / gt).abs() + error[grad_error>0.001] = 1. + error[invalid_mask] = 2. # filter invalid area + error[gt>10000] = 3. + return error.long().squeeze(dim=1) + +def get_incoherent_mask(gt, shape=(384, 512), min_depth=1e-3, max_depth=80): + # incoherent + ori_shpae = gt.shape[-2:] + gt_lr = F.interpolate(gt, shape, mode='bilinear', align_corners=True) + invalid_mask = torch.logical_or(gt<=min_depth, gt>=max_depth) + gt_recover = F.interpolate(gt_lr, ori_shpae, mode='bilinear', align_corners=True) + residue = (gt - gt_recover).abs() + + gt_label = torch.zeros_like(gt) + gt_label[residue >= 0.01] = 1. # set incoherent area as 1 + gt_label[invalid_mask] = 2. # filter invalid area + gt_label[gt>10000] = 3. + return gt_label.long().squeeze(dim=1) + +def get_incoherent_grad_error_mask(gt, coarse_prediction, shape=(384, 512), min_depth=1e-3, max_depth=80): + # incoherent + ori_shpae = gt.shape[-2:] + gt_lr = F.interpolate(gt, shape, mode='bilinear', align_corners=True) + invalid_mask = torch.logical_or(gt<=min_depth, gt>=max_depth) + gt_recover = F.interpolate(gt_lr, ori_shpae, mode='bilinear', align_corners=True) + residue = (gt - gt_recover).abs() + # coarse_prediction = F.interpolate(coarse_prediction, gt.shape[-2:], mode='bilinear', align_corners=True) + # error = (gt - coarse_prediction).abs() + + # grad error + gt_grad = get_grad_map(gt) + coarse_prediction_grad = get_grad_map(coarse_prediction) + grad_error = ((gt_grad - coarse_prediction_grad) / gt).abs() + + bad_area_mask = torch.logical_or(residue>0.01, grad_error>0.001) + gt_label = torch.zeros_like(gt) + gt_label[bad_area_mask] = 1. + gt_label[invalid_mask] = 2. # filter invalid area + gt_label[gt>10000] = 3. 
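The incoherence masks above flag pixels whose ground truth does not survive a round trip through the coarse resolution. A stripped-down sketch of that test (validity and sky handling omitted, threshold assumed):

import torch
import torch.nn.functional as F

def incoherent_region(gt, lr_shape=(384, 512), thresh=0.01):
    # 1 where detail is lost by a low-resolution round trip, as in get_incoherent_mask
    # above (the invalid-depth and far-range classes are omitted here).
    gt_lr = F.interpolate(gt, lr_shape, mode='bilinear', align_corners=True)
    gt_back = F.interpolate(gt_lr, gt.shape[-2:], mode='bilinear', align_corners=True)
    return ((gt - gt_back).abs() >= thresh).long().squeeze(1)

gt = torch.rand(1, 1, 2160, 3840) * 10          # toy 4K depth map
mask = incoherent_region(gt)
print(mask.shape, mask.float().mean())          # (1, 2160, 3840), fraction of flagged pixels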
+ return gt_label.long().squeeze(dim=1) + +def get_incoherent_grad_value_error_mask(gt, coarse_prediction, shape=(384, 512), min_depth=1e-3, max_depth=80): + # incoherent + ori_shpae = gt.shape[-2:] + gt_lr = F.interpolate(gt, shape, mode='bilinear', align_corners=True) + invalid_mask = torch.logical_or(gt<=min_depth, gt>=max_depth) + gt_recover = F.interpolate(gt_lr, ori_shpae, mode='bilinear', align_corners=True) + residue = (gt - gt_recover).abs() + + # value error + coarse_prediction = F.interpolate(coarse_prediction, gt.shape[-2:], mode='bilinear', align_corners=True) + error = (gt - coarse_prediction).abs() + bad_area_mask = torch.logical_or(residue>0.01, error>0.5) + + # grad error + gt_grad = get_grad_map(gt) + coarse_prediction_grad = get_grad_map(coarse_prediction) + grad_error = ((gt_grad - coarse_prediction_grad) / gt).abs() + bad_area_mask = torch.logical_or(grad_error, grad_error>0.001) + + gt_label = torch.zeros_like(gt) + gt_label[bad_area_mask] = 1. + gt_label[invalid_mask] = 2. # filter invalid area + gt_label[gt>10000] = 3. + return gt_label.long().squeeze(dim=1) + +class GeneralizedSoftDiceLoss(nn.Module): + def __init__(self, + p=1, + smooth=1, + reduction='mean'): + super(GeneralizedSoftDiceLoss, self).__init__() + self.p = p + self.smooth = smooth + self.reduction = reduction + + def forward(self, probs, label): + ''' + args: logits: tensor of shape (N, C, H, W) + args: label: tensor of shape(N, H, W) + ''' + # compute loss + numer = torch.sum((probs*label), dim=(2, 3)) + denom = torch.sum(probs.pow(self.p) + label.pow(self.p), dim=(2, 3)) + numer = torch.sum(numer, dim=1) + denom = torch.sum(denom, dim=1) + loss = 1 - (2*numer+self.smooth)/(denom+self.smooth) + if self.reduction == 'mean': + loss = loss.mean() + return loss + +@MODELS.register_module() +class EdgeClsLoss(nn.Module): + """Error loss (pixel-wise)""" + def __init__(self, focal_weight=0.5): + super(EdgeClsLoss, self).__init__() + self.name = 'Error' + self.criterion_dice = GeneralizedSoftDiceLoss() + self.criterion_bce = nn.BCELoss() + self.focal_weight = focal_weight + + def forward(self, input, target): + _, _, h_i, w_i = input.shape + _, h_t, w_t = target.shape + + if h_i != h_t or w_i != w_t: + input = F.interpolate(input, (h_t, w_t), mode='bilinear', align_corners=True) + + target = target.long() + dice = dice_loss(input, target) + focal = focal_loss(input, target, alpha=self.focal_weight, reduction='mean') + + return dice, focal + + +@MODELS.register_module() +class ErrorLoss(nn.Module): + """Error loss (pixel-wise)""" + def __init__(self, loss_type, focal_weight): + super(ErrorLoss, self).__init__() + self.name = 'Error' + self.criterion_dice = GeneralizedSoftDiceLoss() + self.criterion_bce = nn.BCELoss() + self.loss_type = loss_type + self.focal_weight = focal_weight + + def forward(self, input, target, coarse_prediction, min_depth, max_depth): + _, _, h_i, w_i = input.shape + _, _, h_c, w_c = coarse_prediction.shape + _, _, h_t, w_t = target.shape + + if h_i != h_t or w_i != w_t: + input = F.interpolate(input, (h_t, w_t), mode='bilinear', align_corners=True) + if h_c != h_t or w_c != w_t: + coarse_prediction = F.interpolate(coarse_prediction, (h_t, w_t), mode='bilinear') + + + if self.loss_type == 'incoh': + gt_mask = get_incoherent_mask(target, shape=(h_i, w_i), min_depth=min_depth, max_depth=max_depth) + elif self.loss_type == 'incoh+grad': + gt_mask = get_incoherent_grad_error_mask(target, coarse_prediction, shape=(h_i, w_i), min_depth=min_depth, max_depth=max_depth) + elif self.loss_type == 
'incoh+grad+depth': + gt_mask = get_incoherent_grad_value_error_mask(target, coarse_prediction, shape=(h_i, w_i), min_depth=min_depth, max_depth=max_depth) + else: + raise NotImplementedError + + + dice = dice_loss(input, gt_mask) + # focal = focal_loss(input, gt_mask, alpha=0.5, reduction='mean') + focal = focal_loss(input, gt_mask, alpha=self.focal_weight, reduction='mean') + + return dice, focal, gt_mask + + +def ind2sub(idx, w): + # r = idx // cols + # c = idx - r * cols + # return r, c + row = idx // w + col = idx % w + return row, col + +def sub2ind(r, c, cols): + idx = r * cols + c + return idx +import matplotlib.pyplot as plt + +###################################################### +# EdgeguidedRankingLoss +##################################################### +@MODELS.register_module() +class EdgeguidedRankingLoss(nn.Module): + def __init__( + self, + point_pairs=10000, + sigma=0.03, + alpha=1.0, + mask_value=-1e-8, + reweight_target=False, + only_missing_area=False, + min_depth=-1e-3, + max_depth=80, + missing_value=-99, + random_direct=True): + super(EdgeguidedRankingLoss, self).__init__() + self.point_pairs = point_pairs # number of point pairs + self.sigma = sigma # used for determining the ordinal relationship between a selected pair + self.alpha = alpha # used for balancing the effect of = and (<,>) + self.mask_value = mask_value + #self.regularization_loss = GradientLoss(scales=4) + self.reweight_target = reweight_target + self.only_missing_area = only_missing_area + self.min_depth = min_depth + self.max_depth = max_depth + self.missing_value = missing_value + self.random_direct = random_direct + + self.idx = 0 + self.idx_inner = 0 + + def getEdge(self, images): + n,c,h,w = images.size() + a = torch.Tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]).cuda().view((1,1,3,3)).repeat(1, 1, 1, 1) + b = torch.Tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]).cuda().view((1,1,3,3)).repeat(1, 1, 1, 1) + if c == 3: + gradient_x = F.conv2d(images[:,0,:,:].unsqueeze(1), a) + gradient_y = F.conv2d(images[:,0,:,:].unsqueeze(1), b) + else: + gradient_x = F.conv2d(images, a) + gradient_y = F.conv2d(images, b) + edges = torch.sqrt(torch.pow(gradient_x,2)+ torch.pow(gradient_y,2)) + edges = F.pad(edges, (1,1,1,1), "constant", 0) + thetas = torch.atan2(gradient_y, gradient_x) + thetas = F.pad(thetas, (1,1,1,1), "constant", 0) + + return edges, thetas + + def edgeGuidedSampling(self, inputs, targets, edges_img, thetas_img, missing_mask, depth_gt, strict_mask): + # find edges + edges_max = edges_img.max() + edges_mask = edges_img.ge(edges_max*0.1) + edges_mask = torch.logical_and(edges_mask, strict_mask) # 1 edge, 2 strict mask + + if self.only_missing_area: + # edges_mask = torch.logical_and(edges_mask, missing_mask) # base anchor: 1 missing values (0) and 2 edge masks and within 3 strict mask + edges_mask = missing_mask + + edges_loc = edges_mask.nonzero() + + minlen = edges_loc.shape[0] + + if minlen == 0: + return torch.Tensor([]), torch.Tensor([]), torch.Tensor([]), torch.Tensor([]), 0 + + # find anchor points (i.e, edge points) + sample_num = self.point_pairs + sample_index = torch.randint(0, minlen, (sample_num,), dtype=torch.long).cuda() + sample_h, sample_w = edges_loc[sample_index, 0], edges_loc[sample_index, 1] + theta_anchors = thetas_img[sample_h, sample_w] + + sidx = edges_loc.shape[0] // 2 + # plt.figure() + # plt.imshow(missing_mask.squeeze().cpu().numpy()) + # plt.savefig('nfs_scannet/debug_figs/debug_mask_{}.png'.format(self.idx)) + # plt.figure() + # 
plt.imshow(missing_mask.squeeze().cpu().numpy()) + # circle = plt.Circle((sample_w[0], sample_h[0]), 0.5, color='r') + # plt.gca().add_patch(circle) + # plt.savefig('nfs_scannet/debug_figs/debug_anchor_{}.png'.format(self.idx)) + + ## compute the coordinates of 4-points, distances are from [-30, 30] + distance_matrix = torch.randint(2, 31, (4,sample_num)).cuda() + pos_or_neg = torch.ones(4, sample_num).cuda() + pos_or_neg[:2,:] = -pos_or_neg[:2,:] + + distance_matrix = distance_matrix.float() * pos_or_neg + + p = random.random() + # theta_anchors = theta_anchors + math.pi / 2 + # # Normalize the angle to be between -pi and pi + # theta_anchors = (theta_anchors + math.pi) % (2 * math.pi) - math.pi + # col = sample_w.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.sin(theta_anchors).unsqueeze(0)).long() + # row = sample_h.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.cos(theta_anchors).unsqueeze(0)).long() + + if self.random_direct: + if p < 0.5: + col = sample_w.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.cos(theta_anchors).unsqueeze(0)).long() + row = sample_h.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.sin(theta_anchors).unsqueeze(0)).long() + else: + theta_anchors = theta_anchors + math.pi / 2 + theta_anchors = (theta_anchors + math.pi) % (2 * math.pi) - math.pi + col = sample_w.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.sin(theta_anchors).unsqueeze(0)).long() + row = sample_h.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.cos(theta_anchors).unsqueeze(0)).long() + else: + col = sample_w.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.cos(theta_anchors).unsqueeze(0)).long() + row = sample_h.unsqueeze(0).expand(4, sample_num).long() + torch.round(distance_matrix.double() * torch.sin(theta_anchors).unsqueeze(0)).long() + + # constrain 0=w-1) + (row<0) + (row>h-1) + invalid_mask = torch.sum(invalid_mask, dim=0) > 0 + col = col[:, torch.logical_not(invalid_mask)] + row = row[:, torch.logical_not(invalid_mask)] + + # only use one pair + a = torch.stack([row[0, :], col[0, :]]) + b = torch.stack([row[1, :], col[1, :]]) + c = torch.stack([row[2, :], col[2, :]]) + d = torch.stack([row[3, :], col[3, :]]) + + + if self.only_missing_area: + valid_check_a_strict = strict_mask[a[0, :], a[1, :]] == True + valid_check_a_missing = missing_mask[a[0, :], a[1, :]] == True + valid_check_b_strict = strict_mask[b[0, :], b[1, :]] == True + valid_check_b_missing = missing_mask[b[0, :], b[1, :]] == True + valid_check_c_strict = strict_mask[c[0, :], c[1, :]] == True + valid_check_c_missing = missing_mask[c[0, :], c[1, :]] == True + valid_check_d_strict = strict_mask[d[0, :], d[1, :]] == True + valid_check_d_missing = missing_mask[d[0, :], d[1, :]] == True + + valid_mask_ab = torch.logical_not(torch.logical_and(valid_check_a_strict, valid_check_b_strict)) + valid_mask_bc = torch.logical_not(torch.logical_and(valid_check_b_strict, valid_check_c_strict)) + valid_mask_cd = torch.logical_not(torch.logical_and(valid_check_c_strict, valid_check_d_strict)) + + valid_mask = torch.logical_and(valid_mask_ab, valid_mask_bc) + valid_mask = torch.logical_and(valid_mask, valid_mask_cd) + + # valid_check_a_missing = missing_mask[a[0, :], a[1, :]] == True + # valid_check_b_missing = missing_mask[b[0, :], b[1, :]] == True + # 
valid_check_c_missing = missing_mask[c[0, :], c[1, :]] == True + # valid_check_d_missing = missing_mask[d[0, :], d[1, :]] == True + # valid_mask = torch.logical_and(valid_check_a_missing, valid_check_b_missing) + # valid_mask = torch.logical_and(valid_mask, valid_check_c_missing) + # valid_mask = torch.logical_and(valid_mask, valid_check_d_missing) + a = a[:, valid_mask] + b = b[:, valid_mask] + c = c[:, valid_mask] + d = d[:, valid_mask] + + if a.numel() == 0 or b.numel() == 0 or c.numel() == 0 or d.numel() == 0: + return torch.Tensor([]), torch.Tensor([]), torch.Tensor([]), torch.Tensor([]), 0 + + # sidx = 0 + # plt.figure() + # plt.imshow(depth_gt.squeeze().cpu().numpy()) + # circle = plt.Circle((a[1][sidx], a[0][sidx]), 3, color='r') + # plt.gca().add_patch(circle) + # circle = plt.Circle((b[1][sidx], b[0][sidx]), 3, color='r') + # plt.gca().add_patch(circle) + # plt.savefig('nfs_scannet/debug_figs/debug_{}_{}.png'.format(self.idx, self.idx_inner)) + # self.idx_inner += 1 + + A = torch.cat((a,b,c), 1) + B = torch.cat((b,c,d), 1) + sumple_num = A.shape[1] + + inputs_A = inputs[A[0, :], A[1, :]] + inputs_B = inputs[B[0, :], B[1, :]] + targets_A = targets[A[0, :], A[1, :]] + targets_B = targets[B[0, :], B[1, :]] + + return inputs_A, inputs_B, targets_A, targets_B, sumple_num + + + def randomSampling(self, inputs, targets, missing_part, valid_part, sample_num): + # Apply masks to get the valid indices for missing and valid parts + missing_indices = torch.nonzero(missing_part.float()).squeeze() + valid_indices = torch.nonzero(valid_part.float()).squeeze() + + # Ensure that we have enough points to sample from + sample_num = min(sample_num, len(missing_indices), len(valid_indices)) + + # Shuffle and sample indices from the missing and valid parts + shuffle_missing_indices = torch.randperm(len(missing_indices))[:sample_num].cuda() + shuffle_valid_indices = torch.randperm(len(valid_indices))[:sample_num].cuda() + + # Select the sampled points for inputs and targets based on the shuffled indices + inputs_A = inputs[missing_indices[shuffle_missing_indices]] + inputs_B = inputs[valid_indices[shuffle_valid_indices]] + + targets_A = targets[missing_indices[shuffle_missing_indices]] + targets_B = targets[valid_indices[shuffle_valid_indices]] + return inputs_A, inputs_B, targets_A, targets_B, sample_num + + def forward(self, inputs, targets, images, depth_gt=None, interpolate=True): + if interpolate: + targets = F.interpolate(targets, inputs.shape[-2:], mode='bilinear', align_corners=True) + images = F.interpolate(images, inputs.shape[-2:], mode='bilinear', align_corners=True) + depth_gt = F.interpolate(depth_gt, inputs.shape[-2:], mode='bilinear', align_corners=True) + + n, _, _, _= inputs.size() + + # strict_mask is a range mask + strict_mask = torch.logical_and(depth_gt>self.min_depth, depth_gt 0 + + x_0[valid] = (a_11[valid] * b_0[valid] - a_01[valid] * b_1[valid]) / det[valid] + x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / det[valid] + + return x_0, x_1 + +@MODELS.register_module() +class ScaleAndShiftInvariantLoss(nn.Module): + def __init__(self, **kargs): + super().__init__() + self.name = "SSILoss" + + def forward(self, prediction, target, mask, interpolate=True, return_interpolated=False): + + _, _, h_i, w_i = prediction.shape + _, _, h_t, w_t = target.shape + + if h_i != h_t or w_i != w_t: + prediction = F.interpolate(prediction, (h_t, w_t), mode='bilinear', align_corners=True) + + prediction, target, mask = prediction.squeeze(), target.squeeze(), mask.squeeze() + + 
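EdgeguidedRankingLoss supervises sampled point pairs ordinally rather than pixel-wise. A generic sketch of that idea on already-sampled pairs; the tolerance sigma and the exact weighting differ from the class above, so treat this as an illustration, not the repo's loss:

import torch

def pairwise_ranking_loss(pred_a, pred_b, gt_a, gt_b, sigma=0.03):
    # Ordinal supervision on sampled point pairs: pairs whose ground-truth ratio
    # exceeds the tolerance must keep their order; near-equal pairs are pulled together.
    ratio = gt_a / (gt_b + 1e-8)
    labels = torch.zeros_like(ratio)
    labels[ratio >= 1 + sigma] = 1
    labels[ratio <= 1 / (1 + sigma)] = -1
    diff = pred_a - pred_b
    ordered = labels != 0
    loss_order = torch.log(1 + torch.exp(-labels[ordered] * diff[ordered]))
    loss_equal = diff[~ordered] ** 2
    return torch.cat([loss_order, loss_equal]).mean()

gt_a, gt_b = torch.rand(100) * 10 + 0.1, torch.rand(100) * 10 + 0.1
pred_a, pred_b = gt_a + 0.05 * torch.randn(100), gt_b + 0.05 * torch.randn(100)
print(pairwise_ranking_loss(pred_a, pred_b, gt_a, gt_b))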
if torch.sum(mask) <= 1: + print_log("torch.sum(mask) <= 1, hack to skip avoiding bugs", logger='current') + return input * 0.0 + + assert prediction.shape == target.shape, f"Shape mismatch: Expected same shape but got {prediction.shape} and {target.shape}." + + scale, shift = compute_scale_and_shift(prediction, target, mask) + + scaled_prediction = scale.view(-1, 1, 1) * prediction + shift.view(-1, 1, 1) + + loss = nn.functional.l1_loss(scaled_prediction[mask], target[mask]) + return loss + +@MODELS.register_module() +class ExistLoss(nn.Module): + """ExistLoss loss (pixel-wise)""" + def __init__(self, reweight_target): + super(ExistLoss, self).__init__() + self.name = 'ExistLoss' + self.reweight_target = reweight_target + + + def forward(self, pred_grad, pl_grad, pseudo_edge_area): + + pred_grad_edge = pred_grad[pseudo_edge_area] + pl_grad_edge = pl_grad[pseudo_edge_area] + pl_grad_weight = torch.exp(pl_grad_edge) + + if self.reweight_target: + loss = torch.exp(-pred_grad_edge / pl_grad_weight).mean() + else: + loss = torch.exp(-pred_grad_edge).mean() + return loss diff --git a/estimator/models/patchfusion.py b/estimator/models/patchfusion.py new file mode 100644 index 0000000000000000000000000000000000000000..470b5248563c9366c613a5b1adcfa04eee383d04 --- /dev/null +++ b/estimator/models/patchfusion.py @@ -0,0 +1,449 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
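compute_scale_and_shift and ScaleAndShiftInvariantLoss above align the prediction to the target up to an affine transform before measuring error. The same objective can be written as an ordinary least-squares solve; a small sketch (per-image, unbatched, with hypothetical shapes):

import torch

def align_scale_shift(pred, target, mask):
    # Least-squares scale s and shift t with s * pred + t ~ target on valid pixels;
    # the closed-form 2x2 solve in compute_scale_and_shift is the batched equivalent.
    p, t = pred[mask], target[mask]
    A = torch.stack([p, torch.ones_like(p)], dim=1)        # (N, 2)
    sol = torch.linalg.lstsq(A, t.unsqueeze(1)).solution   # (2, 1)
    return sol[0, 0], sol[1, 0]

pred = torch.rand(1, 240, 320)
target = 2.0 * pred + 0.5                                  # known affine relation
mask = torch.ones_like(pred, dtype=torch.bool)
scale, shift = align_scale_shift(pred, target, mask)
print(float(scale), float(shift))                          # ~2.0, ~0.5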
+ +# File author: Zhenyu Li + +import itertools + +import math +import copy +import torch +import random +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +import matplotlib.pyplot as plt +from mmengine import print_log +from mmengine.config import ConfigDict +from torchvision.ops import roi_align as torch_roi_align +from huggingface_hub import PyTorchModelHubMixin +from transformers import PretrainedConfig + +from estimator.registry import MODELS +from estimator.models import build_model +from estimator.models.baseline_pretrain import BaselinePretrain +from estimator.models.utils import generatemask + +from zoedepth.models.zoedepth import ZoeDepth +from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed +from zoedepth.models.layers.dist_layers import ConditionalLogBinomial +from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed) +from zoedepth.models.base_models.midas import Resize as ResizeZoe +from depth_anything.transform import Resize as ResizeDA + + + +@MODELS.register_module() +class PatchFusion(BaselinePretrain, PyTorchModelHubMixin): + def __init__( + self, + config,): + """ZoeDepth model + """ + nn.Module.__init__(self) + + if isinstance(config, ConfigDict): + # convert a ConfigDict to a PretrainedConfig for hf saving + config = PretrainedConfig.from_dict(config.to_dict()) + config.load_branch = True + else: + # used when loading patchfusion from hf model space + config = PretrainedConfig.from_dict(ConfigDict(**config).to_dict()) + config.load_branch = False + config.coarse_branch.pretrained_resource = None + config.fine_branch.pretrained_resource = None + + self.config = config + + self.min_depth = config.min_depth + self.max_depth = config.max_depth + + self.patch_process_shape = config.patch_process_shape + self.tile_cfg = self.prepare_tile_cfg(config.image_raw_shape, config.patch_split_num) + + self.coarse_branch_cfg = config.coarse_branch + if config.coarse_branch.type == 'ZoeDepth': + self.coarse_branch = ZoeDepth.build(**config.coarse_branch) + self.resizer = ResizeZoe(config.patch_process_shape[1], config.patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal") + elif config.coarse_branch.type == 'DA-ZoeDepth': + self.coarse_branch = ZoeDepth.build(**config.coarse_branch) + self.resizer = ResizeDA(config.patch_process_shape[1], config.patch_process_shape[0], keep_aspect_ratio=False, ensure_multiple_of=14, resize_method="minimal") + else: + raise NotImplementedError + + if config.fine_branch.type == 'ZoeDepth': + self.fine_branch = ZoeDepth.build(**config.fine_branch) + elif config.fine_branch.type == 'DA-ZoeDepth': + self.fine_branch = ZoeDepth.build(**config.fine_branch) + else: + raise NotImplementedError + + if config.load_branch: + print_log("Loading coarse_branch from {}".format(config.pretrain_model[0]), logger='current') + print_log(self.coarse_branch.load_state_dict(torch.load(config.pretrain_model[0], map_location='cpu')['model_state_dict'], strict=True), logger='current') # coarse ckp + print_log("Loading fine_branch from {}".format(config.pretrain_model[1]), logger='current') + print_log(self.fine_branch.load_state_dict(torch.load(config.pretrain_model[1], map_location='cpu')['model_state_dict'], strict=True), logger='current') + + # freeze all these parameters + for param in self.coarse_branch.parameters(): + param.requires_grad = False + for param in self.fine_branch.parameters(): + param.requires_grad = False + + 
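Sub-modules such as the SILog loss and the guided fusion decoder are instantiated from config dicts through the mmengine MODELS registry (see estimator/models/builder.py above). A minimal sketch of that pattern, assuming the estimator package is importable and that importing estimator.models.losses is what registers SILogLoss:

from estimator.models.builder import build_model
import estimator.models.losses  # assumed import path; ensures SILogLoss is registered

# A config node carries a 'type' key naming a registered class; the remaining keys
# become constructor kwargs. PatchFusion builds its losses and the guided fusion
# decoder this way, e.g. self.sigloss = build_model(config.sigloss).
sigloss = build_model(dict(type='SILogLoss', beta=0.15))
print(type(sigloss).__name__)   # SILogLoss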
self.sigloss = build_model(config.sigloss) + + N_MIDAS_OUT = 32 + btlnck_features = self.fine_branch.core.output_channels[0] + self.fusion_conv_list = nn.ModuleList() + for i in range(6): + if i == 5: + layer = nn.Conv2d(N_MIDAS_OUT * 2, N_MIDAS_OUT, 3, 1, 1) + else: + layer = nn.Conv2d(btlnck_features * 2, btlnck_features, 3, 1, 1) + self.fusion_conv_list.append(layer) + + self.guided_fusion = build_model(config.guided_fusion) + + # NOTE: a decoder head + if self.coarse_branch_cfg.bin_centers_type == "normed": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayer + elif self.coarse_branch_cfg.bin_centers_type == "softplus": # default + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayerUnnormed + elif self.coarse_branch_cfg.bin_centers_type == "hybrid1": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayerUnnormed + elif self.coarse_branch_cfg.bin_centers_type == "hybrid2": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayer + else: + raise ValueError( + "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") + + N_MIDAS_OUT = 32 + btlnck_features = self.fine_branch.core.output_channels[0] + num_out_features = self.fine_branch.core.output_channels[1:] # all of them are the same + + self.seed_bin_regressor = SeedBinRegressorLayer( + btlnck_features, n_bins=self.coarse_branch_cfg.n_bins, min_depth=config.min_depth, max_depth=config.max_depth) + self.seed_projector = Projector(btlnck_features, self.coarse_branch_cfg.bin_embedding_dim) + self.projectors = nn.ModuleList([ + Projector(num_out, self.coarse_branch_cfg.bin_embedding_dim) + for num_out in num_out_features + ]) + # 1000, 2, inv, mean + self.attractors = nn.ModuleList([ + Attractor(self.coarse_branch_cfg.bin_embedding_dim, self.coarse_branch_cfg.n_bins, n_attractors=self.coarse_branch_cfg.n_attractors[i], min_depth=config.min_depth, max_depth=config.max_depth, + alpha=self.coarse_branch_cfg.attractor_alpha, gamma=self.coarse_branch_cfg.attractor_gamma, kind=self.coarse_branch_cfg.attractor_kind, attractor_type=self.coarse_branch_cfg.attractor_type) + for i in range(len(num_out_features)) + ]) + + last_in = N_MIDAS_OUT + 1 # +1 for relative depth + + # use log binomial instead of softmax + self.conditional_log_binomial = ConditionalLogBinomial( + last_in, self.coarse_branch_cfg.bin_embedding_dim, n_classes=self.coarse_branch_cfg.n_bins, min_temp=self.coarse_branch_cfg.min_temp, max_temp=self.coarse_branch_cfg.max_temp) + + # NOTE: consistency training + self.consistency_training = False + + + def load_dict(self, dict): + return self.load_state_dict(dict, strict=False) + + def get_save_dict(self): + current_model_dict = self.state_dict() + save_state_dict = {} + for k, v in current_model_dict.items(): + if 'coarse_branch' in k or 'fine_branch' in k: + pass + else: + save_state_dict[k] = v + return save_state_dict + + def coarse_forward(self, image_lr): + with torch.no_grad(): + if self.coarse_branch.training: + self.coarse_branch.eval() + + deep_model_output_dict = self.coarse_branch(image_lr, return_final_centers=True) + deep_features = deep_model_output_dict['temp_features'] # x_d0 1/128, x_blocks_feat_0 1/64, x_blocks_feat_1 1/32, x_blocks_feat_2 1/16, x_blocks_feat_3 1/8, midas_final_feat 1/4 [based on 384x4, 512x4] + coarse_prediction = deep_model_output_dict['metric_depth'] + + coarse_features = [ + deep_features['x_d0'], + deep_features['x_blocks_feat_0'], + deep_features['x_blocks_feat_1'], + 
deep_features['x_blocks_feat_2'], + deep_features['x_blocks_feat_3'], + deep_features['midas_final_feat']] # bs, c, h, w + + return coarse_prediction, coarse_features + + def fine_forward(self, image_hr_crop): + with torch.no_grad(): + if self.fine_branch.training: + self.fine_branch.eval() + + deep_model_output_dict = self.fine_branch(image_hr_crop, return_final_centers=True) + deep_features = deep_model_output_dict['temp_features'] # x_d0 1/128, x_blocks_feat_0 1/64, x_blocks_feat_1 1/32, x_blocks_feat_2 1/16, x_blocks_feat_3 1/8, midas_final_feat 1/4 [based on 384x4, 512x4] + fine_prediction = deep_model_output_dict['metric_depth'] + + fine_features = [ + deep_features['x_d0'], + deep_features['x_blocks_feat_0'], + deep_features['x_blocks_feat_1'], + deep_features['x_blocks_feat_2'], + deep_features['x_blocks_feat_3'], + deep_features['midas_final_feat']] # bs, c, h, w + + return fine_prediction, fine_features + + def coarse_postprocess_train(self, coarse_prediction, coarse_features, bboxs, bboxs_feat): + + coarse_features_patch_area = [] + for idx, feat in enumerate(coarse_features): + bs, _, h, w = feat.shape + cur_lvl_feat = torch_roi_align(feat, bboxs_feat, (h, w), h/self.patch_process_shape[0], aligned=True) + coarse_features_patch_area.append(cur_lvl_feat) + + coarse_prediction_roi = torch_roi_align(coarse_prediction, bboxs_feat, coarse_prediction.shape[-2:], coarse_prediction.shape[-2]/self.patch_process_shape[0], aligned=True) + + return coarse_prediction_roi, coarse_features_patch_area + + + def coarse_postprocess_test(self, coarse_prediction, coarse_features, bboxs, bboxs_feat): + patch_num = bboxs_feat.shape[0] + + coarse_features_patch_area = [] + for idx, feat in enumerate(coarse_features): + bs, _, h, w = feat.shape + feat_extend = feat.repeat(patch_num, 1, 1, 1) + cur_lvl_feat = torch_roi_align(feat_extend, bboxs_feat, (h, w), h/self.patch_process_shape[0], aligned=True) + coarse_features_patch_area.append(cur_lvl_feat) + + coarse_prediction = coarse_prediction.repeat(patch_num, 1, 1, 1) + coarse_prediction_roi = torch_roi_align(coarse_prediction, bboxs_feat, coarse_prediction.shape[-2:], coarse_prediction.shape[-2]/self.patch_process_shape[0], aligned=True) + + return_dict = { + 'coarse_depth_roi': coarse_prediction_roi, + 'coarse_feats_roi': coarse_features_patch_area} + + return return_dict + + def fusion_forward(self, fine_depth_pred, crop_input, coarse_model_midas_enc_feats, fine_model_midas_enc_feats, bbox_feat, coarse_depth_roi=None, coarse_feats_roi=None): + feat_cat_list = [] + feat_plus_list = [] + + for l_i, (f_ca, f_c_roi, f_f) in enumerate(zip(coarse_model_midas_enc_feats, coarse_feats_roi, fine_model_midas_enc_feats)): + feat_cat = self.fusion_conv_list[l_i](torch.cat([f_c_roi, f_f], dim=1)) + feat_plus = f_c_roi + f_f + feat_cat_list.append(feat_cat) + feat_plus_list.append(feat_plus) + + input_tensor = torch.cat([coarse_depth_roi, fine_depth_pred, crop_input], dim=1) + + # HACK: hack for depth-anything + # if self.coarse_branch_cfg.type == 'DA-ZoeDepth': + # input_tensor = F.interpolate(input_tensor, size=(448, 592), mode='bilinear', align_corners=True) + + output = self.guided_fusion( + input_tensor = input_tensor, + guide_plus = feat_plus_list, + guide_cat = feat_cat_list, + bbox = bbox_feat, + fine_feat_crop = fine_model_midas_enc_feats, + coarse_feat_whole = coarse_model_midas_enc_feats, + coarse_feat_crop = coarse_feats_roi, + coarse_feat_whole_hack=None)[::-1] # low -> high + + x_blocks = output + x = x_blocks[0] + x_blocks = x_blocks[1:] + + 
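Each pyramid level feeds the guided fusion with two variants of the coarse-ROI/fine pair: a learned concat-plus-conv fusion (feat_cat) and a plain sum (feat_plus), built in fusion_forward above. A toy sketch of that per-level step with an assumed channel count:

import torch
import torch.nn as nn

# Assumed channel count (256) for illustration; shapes follow the crop feature maps.
f_coarse_roi = torch.randn(2, 256, 24, 32)   # coarse feature cropped to the patch
f_fine = torch.randn(2, 256, 24, 32)         # fine-branch feature of the same patch

fusion_conv = nn.Conv2d(256 * 2, 256, kernel_size=3, stride=1, padding=1)
feat_cat = fusion_conv(torch.cat([f_coarse_roi, f_fine], dim=1))   # learned fusion (guide_cat)
feat_plus = f_coarse_roi + f_fine                                  # plain sum (guide_plus)
print(feat_cat.shape, feat_plus.shape)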
proj_feat_list = [] + if self.consistency_training: + if self.consistency_target == 'unet_feat': + proj_feat_list = [] + for idx, feat in enumerate(output): + proj_feat = self.consistency_projs[idx](feat) + proj_feat_list.append(proj_feat) + + # NOTE: below is ZoeDepth implementation + last = x_blocks[-1] # have already been fused in x_blocks + bs, c, h, w = last.shape + rel_cond = torch.zeros((bs, 1, h, w), device=last.device) + _, seed_b_centers = self.seed_bin_regressor(x) + + if self.coarse_branch_cfg.bin_centers_type == 'normed' or self.coarse_branch_cfg.bin_centers_type == 'hybrid2': + b_prev = (seed_b_centers - self.min_depth) / \ + (self.max_depth - self.min_depth) + else: + b_prev = seed_b_centers + + prev_b_embedding = self.seed_projector(x) + + # unroll this loop for better performance + for idx, (projector, attractor, x) in enumerate(zip(self.projectors, self.attractors, x_blocks)): + b_embedding = projector(x) + b, b_centers = attractor( + b_embedding, b_prev, prev_b_embedding, interpolate=True) + b_prev = b.clone() + prev_b_embedding = b_embedding.clone() + + if self.consistency_training: + if self.consistency_target == 'final_feat': + proj_feat_1 = self.consistency_projs[0](b_centers) + proj_feat_2 = self.consistency_projs[1](last) + proj_feat_3 = self.consistency_projs[2](b_embedding) + proj_feat_list = [proj_feat_1, proj_feat_2, proj_feat_3] + + rel_cond = nn.functional.interpolate( + rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) + last = torch.cat([last, rel_cond], dim=1) # + self.coarse_depth_proj(whole_depth_roi_pred) + self.fine_depth_proj(fine_depth_pred) + b_embedding = nn.functional.interpolate( + b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) + # till here, we have features (attached with a relative depth prediction) and embeddings + # post process + # final_pred = out * self.blur_mask + whole_depth_roi_pred * (1-self.blur_mask) + # out = F.interpolate(out, (540, 960), mode='bilinear', align_corners=True) + x = self.conditional_log_binomial(last, b_embedding) + b_centers = nn.functional.interpolate( + b_centers, x.shape[-2:], mode='bilinear', align_corners=True) + + out = torch.sum(x * b_centers, dim=1, keepdim=True) + return out, proj_feat_list + + + def infer_forward(self, imgs_crop, bbox_feat_forward, tile_temp, coarse_temp_dict): + + fine_prediction, fine_features = self.fine_forward(imgs_crop) + + depth_prediction, consistency_target = \ + self.fusion_forward( + fine_prediction, + imgs_crop, + tile_temp['coarse_features'], + fine_features, + bbox_feat_forward, + **coarse_temp_dict) + + return depth_prediction + + + def forward( + self, + mode, + image_lr, + image_hr, + depth_gt=None, + crops_image_hr=None, + crop_depths=None, + bboxs=None, + tile_cfg=None, + cai_mode='m1', + process_num=4): + + if mode == 'train': + bboxs_feat_factor = torch.tensor([ + 1 / self.tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / self.tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0], + 1 / self.tile_cfg['image_raw_shape'][1] * self.patch_process_shape[1], + 1 / self.tile_cfg['image_raw_shape'][0] * self.patch_process_shape[0]], device=bboxs.device).unsqueeze(dim=0) + bboxs_feat = bboxs * bboxs_feat_factor + inds = torch.arange(bboxs.shape[0]).to(bboxs.device).unsqueeze(dim=-1) + bboxs_feat = torch.cat((inds, bboxs_feat), dim=-1) + + coarse_prediction, coarse_features = self.coarse_forward(image_lr) + fine_prediction, fine_features = self.fine_forward(crops_image_hr) + coarse_prediction_roi, 
coarse_features_patch_area = self.coarse_postprocess_train(coarse_prediction, coarse_features, bboxs, bboxs_feat) + + depth_prediction, consistency_target = self.fusion_forward( + fine_prediction, + crops_image_hr, + coarse_features, + fine_features, + bboxs_feat, + coarse_depth_roi=coarse_prediction_roi, + coarse_feats_roi=coarse_features_patch_area,) + + loss_dict = {} + loss_dict['sig_loss'] = self.sigloss(depth_prediction, crop_depths, self.min_depth, self.max_depth) + loss_dict['total_loss'] = loss_dict['sig_loss'] + + return loss_dict, {'rgb': crops_image_hr, 'depth_pred': depth_prediction, 'depth_gt': crop_depths} + + else: + if tile_cfg is None: + tile_cfg = self.tile_cfg + else: + tile_cfg = self.prepare_tile_cfg(tile_cfg['image_raw_shape'], tile_cfg['patch_split_num']) + + assert image_hr.shape[0] == 1 + + coarse_prediction, coarse_features = self.coarse_forward(image_lr) + + tile_temp = { + 'coarse_prediction': coarse_prediction, + 'coarse_features': coarse_features,} + + blur_mask = generatemask((self.patch_process_shape[0], self.patch_process_shape[1])) + 1e-3 + blur_mask = torch.tensor(blur_mask, device=image_hr.device) + avg_depth_map = self.regular_tile( + offset=[0, 0], + offset_process=[0, 0], + image_hr=image_hr[0], + init_flag=True, + tile_temp=tile_temp, + blur_mask=blur_mask, + tile_cfg=tile_cfg, + process_num=process_num) + + if cai_mode == 'm2' or cai_mode[0] == 'r': + avg_depth_map = self.regular_tile( + offset=[0, tile_cfg['patch_raw_shape'][1]//2], + offset_process=[0, self.patch_process_shape[1]//2], + image_hr=image_hr[0], init_flag=False, tile_temp=tile_temp, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + avg_depth_map = self.regular_tile( + offset=[tile_cfg['patch_raw_shape'][0]//2, 0], + offset_process=[self.patch_process_shape[0]//2, 0], + image_hr=image_hr[0], init_flag=False, tile_temp=tile_temp, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + avg_depth_map = self.regular_tile( + offset=[tile_cfg['patch_raw_shape'][0]//2, tile_cfg['patch_raw_shape'][1]//2], + offset_process=[self.patch_process_shape[0]//2, self.patch_process_shape[1]//2], + init_flag=False, image_hr=image_hr[0], tile_temp=tile_temp, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + + if cai_mode[0] == 'r': + blur_mask = generatemask((tile_cfg['patch_raw_shape'][0], tile_cfg['patch_raw_shape'][1])) + 1e-3 + blur_mask = torch.tensor(blur_mask, device=image_hr.device) + avg_depth_map.resize(tile_cfg['image_raw_shape']) + patch_num = int(cai_mode[1:]) // process_num + for i in range(patch_num): + avg_depth_map = self.random_tile( + image_hr=image_hr[0], tile_temp=tile_temp, blur_mask=blur_mask, avg_depth_map=avg_depth_map, tile_cfg=tile_cfg, process_num=process_num) + + depth = avg_depth_map.average_map + depth = depth.unsqueeze(dim=0).unsqueeze(dim=0) + + return depth, {'rgb': image_lr, 'depth_pred': depth, 'depth_gt': depth_gt} + diff --git a/estimator/models/utils.py b/estimator/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0da1a7f6c2906ade65337012ffd334f9abeb5ce3 --- /dev/null +++ b/estimator/models/utils.py @@ -0,0 +1,47 @@ +import torch +import cv2 +import numpy as np +import torch.nn.functional as F + +def get_activation(name, bank): + def hook(model, input, output): + bank[name] = output + return hook + +class HookTool: + def __init__(self): + self.feat = None + + def hook_in_fun(self, module, fea_in, 
fea_out):
+        self.feat = fea_in
+
+    def hook_out_fun(self, module, fea_in, fea_out):
+        self.feat = fea_out
+
+class RunningAverageMap:
+    """Running average map of depth predictions, kept together with a per-pixel count map."""
+    def __init__(self, average_map, count_map):
+        self.average_map = average_map
+        self.count_map = count_map
+        self.average_map = self.average_map / self.count_map
+
+    def update(self, pred_map, ct_map):
+        self.average_map = (pred_map + self.count_map * self.average_map) / (self.count_map + ct_map)
+        self.count_map = self.count_map + ct_map
+
+    def resize(self, resolution):
+        temp_avg_map = self.average_map.unsqueeze(0).unsqueeze(0)
+        temp_count_map = self.count_map.unsqueeze(0).unsqueeze(0)
+        self.average_map = F.interpolate(temp_avg_map, size=resolution).squeeze()
+        self.count_map = F.interpolate(temp_count_map, size=resolution, mode='bilinear', align_corners=True).squeeze()
+
+def generatemask(size):
+    # Generates a Gaussian blending mask
+    mask = np.zeros(size, dtype=np.float32)
+    sigma = int(size[0]/16)
+    k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1)
+    mask[int(0.1*size[0]):size[0] - int(0.1*size[0]), int(0.1*size[1]): size[1] - int(0.1*size[1])] = 1
+    mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma)
+    mask = (mask - mask.min()) / (mask.max() - mask.min())
+    mask = mask.astype(np.float32)
+    return mask
\ No newline at end of file
diff --git a/estimator/registry/__init__.py b/estimator/registry/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3d95162f0cdd94ecf5145171eac84e54b399593
--- /dev/null
+++ b/estimator/registry/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .registry import (MODELS, DATASETS)
+
+__all__ = [
+    'MODELS', 'DATASETS'
+]
\ No newline at end of file
diff --git a/estimator/registry/registry.py b/estimator/registry/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..e709ac36a4d940614e22e3ee0f84d589d0fce834
--- /dev/null
+++ b/estimator/registry/registry.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
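+#
+# The registries below follow mmengine's Registry API. A minimal usage sketch
+# (illustrative only; `ExampleDepthModel` is a hypothetical class name, not
+# something added by this diff):
+#
+#     from estimator.registry import MODELS
+#
+#     @MODELS.register_module()
+#     class ExampleDepthModel(torch.nn.Module):
+#         ...
+#
+#     model = MODELS.build(dict(type='ExampleDepthModel'))
+#
+# i.e. a registered class can then be instantiated from a config dict by its
+# 'type' string.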
+import torch.nn as nn +from mmengine import Registry +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import DATASETS as MMENGINE_DATASETS + +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['estimator.models']) +DATASETS = Registry('dataset', parent=MMENGINE_DATASETS, locations=['estimator.datasets']) diff --git a/estimator/tester/__init__.py b/estimator/tester/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b9628add32aeb78ce57ea0a039cb3271fe27dd5e --- /dev/null +++ b/estimator/tester/__init__.py @@ -0,0 +1 @@ +from .tester import Tester \ No newline at end of file diff --git a/estimator/tester/tester.py b/estimator/tester/tester.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb9202e5e3a24f5d891f68e7e4daf9641d36712 --- /dev/null +++ b/estimator/tester/tester.py @@ -0,0 +1,90 @@ +import os +import cv2 +import wandb +import numpy as np +import torch +import mmengine +from mmengine.optim import build_optim_wrapper +import torch.optim as optim +import matplotlib.pyplot as plt +from mmengine.dist import get_dist_info, collect_results_cpu, collect_results_gpu +from mmengine import print_log +from estimator.utils import colorize, colorize_infer_pfv1, colorize_rescale +import torch.nn.functional as F +from tqdm import tqdm +from mmengine.utils import mkdir_or_exist +import copy +from skimage import io +import kornia +from PIL import Image + +class Tester: + """ + Tester class + """ + def __init__( + self, + config, + runner_info, + dataloader, + model): + + self.config = config + self.runner_info = runner_info + self.dataloader = dataloader + self.model = model + self.collect_input_args = config.collect_input_args + + def collect_input(self, batch_data): + collect_batch_data = dict() + for k, v in batch_data.items(): + if isinstance(v, torch.Tensor): + if k in self.collect_input_args: + collect_batch_data[k] = v.cuda() + return collect_batch_data + + @torch.no_grad() + def run(self, cai_mode='p16', process_num=4): + + results = [] + dataset = self.dataloader.dataset + loader_indices = self.dataloader.batch_sampler + + rank, world_size = get_dist_info() + if self.runner_info.rank == 0: + prog_bar = mmengine.utils.ProgressBar(len(dataset)) + + for idx, (batch_indices, batch_data) in enumerate(zip(loader_indices, self.dataloader)): + + batch_data_collect = self.collect_input(batch_data) + result, log_dict = self.model(mode='infer', cai_mode=cai_mode, process_num=process_num, **batch_data_collect) # might use test/val to split cases + + if self.runner_info.save: + + color_pred = colorize(result, cmap='magma_r')[:, :, [2, 1, 0]] + cv2.imwrite(os.path.join(self.runner_info.work_dir, '{}.png'.format(batch_data['img_file_basename'][0])), color_pred) + + # Save as PNG + raw_depth = Image.fromarray((result.clone().squeeze().detach().cpu().numpy()*256).astype('uint16')) + raw_depth.save(os.path.join(self.runner_info.work_dir, '{}_uint16.png'.format(batch_data['img_file_basename'][0]))) + + if batch_data_collect.get('depth_gt', None) is not None: + metrics = dataset.get_metrics( + batch_data_collect['depth_gt'], + result, + seg_image=batch_data_collect.get('seg_image', None), + disp_gt_edges=batch_data.get('boundary', None), + image_hr=batch_data.get('image_hr', None)) + results.extend([metrics]) + + if self.runner_info.rank == 0: + batch_size = len(result) * world_size + for _ in range(batch_size): + prog_bar.update() + + if batch_data_collect.get('depth_gt', None) is not None: + results = 
collect_results_gpu(results, len(dataset)) + if self.runner_info.rank == 0: + ret_dict = dataset.evaluate(results) + + \ No newline at end of file diff --git a/estimator/trainer/__init__.py b/estimator/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c0a8a4a91724515aee0aecd8217cfe16ee5ec80 --- /dev/null +++ b/estimator/trainer/__init__.py @@ -0,0 +1 @@ +from .trainer import * diff --git a/estimator/trainer/trainer.py b/estimator/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..800afd7d96c115fd865d3099f702f4e5c358e5bb --- /dev/null +++ b/estimator/trainer/trainer.py @@ -0,0 +1,314 @@ +import os +import wandb +import numpy as np +import torch +import mmengine +from mmengine.optim import build_optim_wrapper +import torch.optim as optim +import matplotlib.pyplot as plt +import torch.distributed as dist +from mmengine.dist import get_dist_info, collect_results_cpu, collect_results_gpu +from mmengine import print_log +import torch.nn.functional as F +from tqdm import tqdm +from estimator.utils import colorize + +class Trainer: + """ + Trainer class + """ + def __init__( + self, + config, + runner_info, + train_sampler, + train_dataloader, + val_dataloader, + model): + + self.config = config + self.runner_info = runner_info + + self.train_sampler = train_sampler + self.train_dataloader = train_dataloader + self.val_dataloader = val_dataloader + self.model = model + + # build opt and schedule + self.optimizer_wrapper = build_optim_wrapper(self.model, config.optim_wrapper) + self.scheduler = optim.lr_scheduler.OneCycleLR( + self.optimizer_wrapper.optimizer, [l['lr'] for l in self.optimizer_wrapper.optimizer.param_groups], epochs=self.config.train_cfg.max_epochs, steps_per_epoch=len(self.train_dataloader), + cycle_momentum=config.param_scheduler.cycle_momentum, base_momentum=config.param_scheduler.get('base_momentum', 0.85), max_momentum=config.param_scheduler.get('max_momentum', 0.95), + div_factor=config.param_scheduler.div_factor, final_div_factor=config.param_scheduler.final_div_factor, pct_start=config.param_scheduler.pct_start, three_phase=config.param_scheduler.three_phase) + + # I'd like use wandb log_name + self.train_step = 0 # for training + self.val_step = 0 # for validation + + self.iters_per_train_epoch = len(self.train_dataloader) + self.iters_per_val_epoch = len(self.val_dataloader) + self.grad_scaler = torch.cuda.amp.GradScaler() + self.collect_input_args = config.collect_input_args + print_log('successfully init trainer', logger='current') + + + def log_images(self, log_dict, prefix="", scalar_cmap="turbo_r", min_depth=1e-3, max_depth=80, step=0): + # Custom log images. 
Please add more items to the log dict returned from the model + + wimages = dict() + wimages['{}/step'.format(prefix)] = step + + rgb = log_dict.get('rgb')[0] + _, h_rgb, w_rgb = rgb.shape + + if 'depth_pred' in log_dict.keys(): + depth_pred = log_dict.get('depth_pred')[0] + depth_pred = depth_pred.squeeze() + depth_gt = log_dict.get('depth_gt')[0] + depth_gt = depth_gt.squeeze() + invalid_mask = torch.logical_or(depth_gt<=min_depth, depth_gt>=max_depth).detach().cpu().squeeze().numpy() # (h, w) + + if np.sum(np.logical_not(invalid_mask)) == 0: # all pixels in gt are invalid + return + + depth_gt_color = colorize(depth_gt, vmin=None, vmax=None, invalid_mask=invalid_mask, cmap=scalar_cmap) + depth_pred_color = colorize(depth_pred, vmin=None, vmax=None) + + depth_gt_img = wandb.Image(depth_gt_color, caption='depth_gt') + depth_pred_img = wandb.Image(depth_pred_color, caption='depth_pred') + rgb = wandb.Image(rgb, caption='rgb') + + wimages['{}/LogImageDepth'.format(prefix)] = [rgb, depth_gt_img, depth_pred_img] + + if 'seg_pred' in log_dict.keys(): + seg_pred = log_dict.get('seg_pred')[0] + seg_pred = seg_pred.squeeze() + seg_gt = log_dict.get('seg_gt')[0] + seg_gt = seg_gt.squeeze() + # class_labels = {0: "good", 1: "refine", 2: "oor", 3: "sky"} + class_labels = {0: "bg", 1: "edge"} + + mask_img = wandb.Image( + rgb, + masks={ + "predictions": {"mask_data": seg_pred.detach().cpu().numpy(), "class_labels": class_labels}, + "ground_truth": {"mask_data": seg_gt.detach().cpu().numpy(), "class_labels": class_labels}, + }, + caption='segmentation') + wimages['{}/LogImageSeg'.format(prefix)] = [mask_img] + + if 'mask' in log_dict.keys(): + mask = log_dict.get('mask')[0] + mask = mask.squeeze().float()*255 + mask_img = wandb.Image( + mask.detach().cpu().numpy(), + caption='segmentation') + cur_log = wimages['{}/LogImageDepth'.format(prefix)] + cur_log.append(mask_img) + wimages['{}/LogImageDepth'.format(prefix)] = cur_log + + # some other things + if 'pseudo_gt' in log_dict.keys(): + pseudo_gt = log_dict.get('pseudo_gt')[0] + pseudo_gt = pseudo_gt.squeeze() + pseudo_gt_color = colorize(pseudo_gt, vmin=None, vmax=None, cmap=scalar_cmap) + pseudo_gt_img = wandb.Image(pseudo_gt_color, caption='pseudo_gt') + cur_log = wimages['{}/LogImageDepth'.format(prefix)] + cur_log.append(pseudo_gt_img) + # pseudo_gt = log_dict.get('pseudo_gt')[0][0] + # pseudo_gt = pseudo_gt * 255 + # pseudo_gt = pseudo_gt.astype(np.uint8) + # pseudo_gt_img = wandb.Image(pseudo_gt, caption='pseudo_gt') + # cur_log = wimages['{}/LogImageDepth'.format(prefix)] + # cur_log.append(pseudo_gt_img) + + wandb.log(wimages) + + + def collect_input(self, batch_data): + collect_batch_data = dict() + for k, v in batch_data.items(): + if isinstance(v, torch.Tensor): + if k in self.collect_input_args: + collect_batch_data[k] = v.cuda() + return collect_batch_data + + + @torch.no_grad() + def val_epoch(self): + results = [] + results_list = [[] for _ in range(8)] + + self.model.eval() + dataset = self.val_dataloader.dataset + loader_indices = self.val_dataloader.batch_sampler + + rank, world_size = get_dist_info() + if self.runner_info.rank == 0: + prog_bar = mmengine.utils.ProgressBar(len(dataset)) + + for idx, (batch_indices, batch_data) in enumerate(zip(loader_indices, self.val_dataloader)): + self.val_step += 1 + + batch_data_collect = self.collect_input(batch_data) + # result, log_dict = self.model(mode='infer', **batch_data_collect) + result, log_dict = self.model(mode='infer', cai_mode='m1', process_num=4, **batch_data_collect) # might 
use test/val to split cases + + if isinstance(result, list): + # in case you have multiple results + for num_res in range(len(result)): + metrics = dataset.get_metrics( + batch_data_collect['depth_gt'], + result[num_res], + disp_gt_edges=batch_data.get('boundary', None), + additional_mask=log_dict.get('mask', None), + image_hr=batch_data.get('image_hr', None)) + results_list[num_res].extend([metrics]) + + else: + metrics = dataset.get_metrics( + batch_data_collect['depth_gt'], + result, + seg_image=batch_data_collect.get('seg_image', None), + disp_gt_edges=batch_data.get('boundary', None), + additional_mask=log_dict.get('mask', None), + image_hr=batch_data.get('image_hr', None)) + results.extend([metrics]) + + if self.runner_info.rank == 0: + if isinstance(result, list): + batch_size = len(result[0]) * world_size + else: + batch_size = len(result) * world_size + for _ in range(batch_size): + prog_bar.update() + + if self.runner_info.rank == 0 and self.config.debug == False and (idx + 1) % self.config.train_cfg.val_log_img_interval == False: + self.log_images(log_dict=log_dict, prefix="Val", min_depth=self.config.model.min_depth, max_depth=self.config.model.max_depth, step=self.val_step) + + # collect results from all ranks + if isinstance(result, list): + results_collect = [] + for results in results_list: + results = collect_results_gpu(results, len(dataset)) + results_collect.append(results) + else: + results = collect_results_gpu(results, len(dataset)) + + if self.runner_info.rank == 0: + if isinstance(result, list): + for num_refine in range(len(result)): + ret_dict = dataset.evaluate(results_collect[num_refine]) + else: + ret_dict = dataset.evaluate(results) + + if self.runner_info.rank == 0 and self.config.debug == False: + wdict = dict() + for k, v in ret_dict.items(): + wdict["Val/{}".format(k)] = v.item() + wdict['Val/step'] = self.val_step + wandb.log(wdict) + + torch.cuda.empty_cache() + if self.runner_info.distributed is True: + torch.distributed.barrier() + + self.model.train() # avoid changing model state + + + def train_epoch(self, epoch_idx): + + self.model.train() + if self.runner_info.distributed: + dist.barrier() + + pbar = tqdm(enumerate(self.train_dataloader), desc=f"Epoch: {epoch_idx + 1}/{self.config.train_cfg.max_epochs}. 
Loop: Train", + total=self.iters_per_train_epoch) if self.runner_info.rank == 0 else enumerate(self.train_dataloader) + for idx, batch_data in pbar: + self.train_step += 1 + + batch_data_collect = self.collect_input(batch_data) + loss_dict, log_dict = self.model(mode='train', **batch_data_collect) + + total_loss = loss_dict['total_loss'] + # total_loss = self.grad_scaler.scale(loss_dict['total_loss']) + + self.optimizer_wrapper.update_params(total_loss) + self.scheduler.step() + + # log something here + if self.runner_info.rank == 0: + log_info = 'Epoch: [{:02d}/{:02d}]'.format(epoch_idx + 1, self.config.train_cfg.max_epochs, idx + 1, len(self.train_dataloader)) + for k, v in loss_dict.items(): + log_info += ' - {}: {:.2f}'.format(k, v.item()) + pbar.set_description(log_info) + + if (idx + 1) % self.config.train_cfg.log_interval == 0: + log_info = 'Epoch: [{:02d}/{:02d}] - Step: [{:05d}/{:05d}] - Time: [{}/{}] - Total Loss: {}'.format(epoch_idx + 1, self.config.train_cfg.max_epochs, idx + 1, len(self.train_dataloader), 1, 1, total_loss) + for k, v in loss_dict.items(): + if k != 'total_loss': + log_info += ' - {}: {}'.format(k, v) + print_log(log_info, logger='current') + + if self.runner_info.rank == 0 and self.config.debug == False: + wdict = dict() + wdict['Train/total_loss'] = total_loss.item() + wdict['Train/LR'] = self.optimizer_wrapper.get_lr()['lr'][0] + wdict['Train/momentum'] = self.optimizer_wrapper.get_momentum()['momentum'][0] + wdict['Train/step'] = self.train_step + for k, v in loss_dict.items(): + if k != 'total_loss': + if isinstance(v, torch.Tensor): + wdict['Train/{}'.format(k)] = v.item() + else: + wdict['Train/{}'.format(k)] = v + wandb.log(wdict) + + if self.runner_info.rank == 0 and self.config.debug == False and (idx + 1) % self.config.train_cfg.train_log_img_interval == False: + self.log_images(log_dict=log_dict, prefix="Train", min_depth=self.config.model.min_depth, max_depth=self.config.model.max_depth, step=self.train_step) + + if self.config.train_cfg.val_type == 'iter_base': + if (self.train_step + 1) % self.config.train_cfg.val_interval == 0 and (self.train_step + 1) >= self.config.train_cfg.get('eval_start', 0): + self.val_epoch() + + def save_checkpoint(self, epoch_idx): + # As default, the model is wrappered by DDP!!! Hence, even if you're using one gpu, please use dist_train.sh + if hasattr(self.model.module, 'get_save_dict'): + print_log('Saving ckp, but use the inner get_save_dict fuction to get model_dict', logger='current') + # print_log('For saving space. Would you like to save base model several times? :>', logger='current') + model_dict = self.model.module.get_save_dict() + else: + model_dict = self.model.module.state_dict() + + checkpoint_dict = { + 'epoch': epoch_idx, + 'model_state_dict': model_dict, + 'optim_state_dict': self.optimizer_wrapper.state_dict(), + 'schedule_state_dict': self.scheduler.state_dict()} + + if self.runner_info.rank == 0: + torch.save(checkpoint_dict, os.path.join(self.runner_info.work_dir, 'checkpoint_{:02d}.pth'.format(epoch_idx + 1))) + log_info = 'save checkpoint_{:02d}.pth at {}'.format(epoch_idx + 1, self.runner_info.work_dir) + print_log(log_info, logger='current') + + def run(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + print_log('training param: {}'.format(name), logger='current') + + # self.val_epoch() # do you want to debug val step? 
+ for epoch_idx in range(self.config.train_cfg.max_epochs): + if self.runner_info.distributed: + self.train_sampler.set_epoch(epoch_idx) + self.train_epoch(epoch_idx) + if (epoch_idx + 1) % self.config.train_cfg.val_interval == 0 and (epoch_idx + 1) >= self.config.train_cfg.get('eval_start', 0) and self.config.train_cfg.val_type == 'epoch_base': + self.val_epoch() + if (epoch_idx + 1) % self.config.train_cfg.save_checkpoint_interval == 0: + self.save_checkpoint(epoch_idx) + if (epoch_idx + 1) % self.config.train_cfg.get('early_stop_epoch', 9999999) == 0: # Are you using 99999999+ epochs? + print_log('early stop at epoch: {}'.format(epoch_idx), logger='current') + break + + if self.config.train_cfg.val_type == 'iter_base': + self.val_epoch() + \ No newline at end of file diff --git a/estimator/utils/__init__.py b/estimator/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d085064158673bde9fde36ef47efb85396114c76 --- /dev/null +++ b/estimator/utils/__init__.py @@ -0,0 +1,7 @@ +from .runner import RunnerInfo +from .image_ops import get_boundaries +from .dist import setup_env +from .misc import log_env, fix_random_seed, ConfigType, OptConfigType, MultiConfig, OptMultiConfig +from .metric import compute_metrics, extract_edges +from .color import colorize, colorize_infer_pfv1, colorize_rescale +from .type import * diff --git a/estimator/utils/color.py b/estimator/utils/color.py new file mode 100644 index 0000000000000000000000000000000000000000..5995c54b1ec23d65af1bb4267df33384426ab192 --- /dev/null +++ b/estimator/utils/color.py @@ -0,0 +1,158 @@ + +import matplotlib +import matplotlib.cm +import numpy as np +import torch + + +def colorize_infer_pfv1(value, cmap='magma_r', vmin=None, vmax=None): + # normalize + vmin = value.min() if vmin is None else vmin + # vmax = value.max() if vmax is None else vmax + vmax = np.percentile(value, 95) if vmax is None else vmax + + if vmin != vmax: + value = (value - vmin) / (vmax - vmin) # vmin..vmax + else: + value = value * 0. + + cmapper = matplotlib.cm.get_cmap(cmap) + value = cmapper(value, bytes=True) # ((1)xhxwx4) + + value = value[:, :, :3] # bgr -> rgb + rgb_value = value[..., ::-1] + + return rgb_value + + +def colorize_rescale(value, vmin=None, vmax=None, cmap='turbo_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None, vminp=2, vmaxp=95): + """Converts a depth map to a color image. + + Args: + value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed + vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. + vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. + cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. + invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. + invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. + background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). + gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. + value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. 
+ + Returns: + numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4) + """ + if isinstance(value, torch.Tensor): + value = value.detach().cpu().numpy() + + value = value.squeeze() + if invalid_mask is None: + invalid_mask = value == invalid_val + + mask = np.logical_not(invalid_mask) + + # normalize + # vmin = np.percentile(value[mask],2) if vmin is None else vmin + # vmax = np.percentile(value[mask],85) if vmax is None else vmax + + # if vminp is None: + # vmin = value.min() + # else: + # vmin = np.percentile(value[mask],vminp) if vmin is None else vmin + # vmax = np.percentile(value[mask],vmaxp) if vmax is None else vmax + vmin = value.min() if vmin is None else vmin + vmax = value.max() if vmax is None else vmax + + if vmin != vmax: + value = (value - vmin) / (vmax - vmin) # vmin..vmax + else: + # Avoid 0-division + value = value * 0. + + # squeeze last dim if it exists + # grey out the invalid values + + value[invalid_mask] = np.nan + cmapper = matplotlib.cm.get_cmap(cmap) + if value_transform: + value = value_transform(value) + # value = value / value.max() + value = cmapper(value, bytes=True) # (nxmx4) + + # img = value[:, :, :] + img = value[...] + img[invalid_mask] = background_color + + # return img.transpose((2, 0, 1)) + if gamma_corrected: + # gamma correction + img = img / 255 + img = np.power(img, 2.2) + img = img * 255 + img = img.astype(np.uint8) + return img + +def colorize(value, vmin=None, vmax=None, cmap='turbo_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None, vminp=2, vmaxp=95): + """Converts a depth map to a color image. + + Args: + value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed + vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. + vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. + cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. + invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. + invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. + background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). + gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. + value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. + + Returns: + numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4) + """ + if isinstance(value, torch.Tensor): + value = value.detach().cpu().numpy() + + value = value.squeeze() + if invalid_mask is None: + invalid_mask = value == invalid_val + + mask = np.logical_not(invalid_mask) + + # normalize + # vmin = np.percentile(value[mask],2) if vmin is None else vmin + # vmax = np.percentile(value[mask],85) if vmax is None else vmax + + # if vminp is None: + # vmin = value.min() + # else: + vmin = np.percentile(value[mask],vminp) if vmin is None else vmin + vmax = np.percentile(value[mask],vmaxp) if vmax is None else vmax + + if vmin != vmax: + value = (value - vmin) / (vmax - vmin) # vmin..vmax + else: + # Avoid 0-division + value = value * 0. 
+ + # squeeze last dim if it exists + # grey out the invalid values + + value[invalid_mask] = np.nan + cmapper = matplotlib.cm.get_cmap(cmap) + if value_transform: + value = value_transform(value) + # value = value / value.max() + value = cmapper(value, bytes=True) # (nxmx4) + + # img = value[:, :, :] + img = value[...] + img[invalid_mask] = background_color + + # return img.transpose((2, 0, 1)) + if gamma_corrected: + # gamma correction + img = img / 255 + img = np.power(img, 2.2) + img = img * 255 + img = img.astype(np.uint8) + return img \ No newline at end of file diff --git a/estimator/utils/dist.py b/estimator/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..2058333bdf6c4aae68165c993c7c1fe53723fe76 --- /dev/null +++ b/estimator/utils/dist.py @@ -0,0 +1,43 @@ +import time +import torch + +from mmengine.dist import (broadcast, get_dist_info, init_dist, is_distributed, get_local_rank) +from mmengine.utils.dl_utils import (set_multi_processing) +def setup_env(env_cfg, distributed, launcher): + """Setup environment. + + An example of ``env_cfg``:: + + env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict( + mp_start_method='fork', + opencv_num_threads=0 + ), + dist_cfg=dict(backend='nccl', timeout=1800), + resource_limit=4096 + ) + + Args: + env_cfg (dict): Config for setting environment. + """ + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=distributed) + + # init distributed env first, since logger depends on the dist info. + if distributed and not is_distributed(): + dist_cfg: dict = env_cfg.get('dist_cfg', {}) + init_dist(launcher, **dist_cfg) + + _rank, _world_size = get_dist_info() + # _local_rank = get_local_rank() + + timestamp = torch.tensor(time.time(), dtype=torch.float64) + # broadcast timestamp from 0 process to other processes + broadcast(timestamp) + _timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime(timestamp.item())) + return _rank, _world_size, _timestamp + diff --git a/estimator/utils/image_ops.py b/estimator/utils/image_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c35c95d875cbfb7adea2fa15139e9c0b47a8244a --- /dev/null +++ b/estimator/utils/image_ops.py @@ -0,0 +1,36 @@ +import cv2 +import numpy as np + +def shift_2d_replace(data, dx, dy, constant=False): + shifted_data = np.roll(data, dx, axis=1) + if dx < 0: + shifted_data[:, dx:] = constant + elif dx > 0: + shifted_data[:, 0:dx] = constant + + shifted_data = np.roll(shifted_data, dy, axis=0) + if dy < 0: + shifted_data[dy:, :] = constant + elif dy > 0: + shifted_data[0:dy, :] = constant + return shifted_data + +def soft_edge_error(pred, gt, radius=1): + abs_diff=[] + for i in range(-radius, radius + 1): + for j in range(-radius, radius + 1): + abs_diff.append(np.abs(shift_2d_replace(gt, i, j, 0) - pred)) + return np.minimum.reduce(abs_diff) + +def get_boundaries(disp, th=1., dilation=10): + edges_y = np.logical_or(np.pad(np.abs(disp[1:, :] - disp[:-1, :]) > th, ((1, 0), (0, 0))), + np.pad(np.abs(disp[:-1, :] - disp[1:, :]) > th, ((0, 1), (0, 0)))) + edges_x = np.logical_or(np.pad(np.abs(disp[:, 1:] - disp[:, :-1]) > th, ((0, 0), (1, 0))), + np.pad(np.abs(disp[:, :-1] - disp[:,1:]) > th, ((0, 0), (0, 1)))) + edges = np.logical_or(edges_y, edges_x).astype(np.float32) + + if dilation > 0: + kernel = np.ones((dilation, dilation), np.uint8) + edges = cv2.dilate(edges, kernel, iterations=1) + + return edges diff --git 
a/estimator/utils/metric.py b/estimator/utils/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..92ff139e9ed90dff6f1ebd05b2472f19192c0bf1 --- /dev/null +++ b/estimator/utils/metric.py @@ -0,0 +1,206 @@ +import cv2 +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +from scipy import ndimage +from skimage.feature import canny +import kornia + +def compute_errors(gt, pred): + """Compute metrics for 'pred' compared to 'gt' + + Args: + gt (numpy.ndarray): Ground truth values + pred (numpy.ndarray): Predicted values + + gt.shape should be equal to pred.shape + + Returns: + dict: Dictionary containing the following metrics: + 'a1': Delta1 accuracy: Fraction of pixels that are within a scale factor of 1.25 + 'a2': Delta2 accuracy: Fraction of pixels that are within a scale factor of 1.25^2 + 'a3': Delta3 accuracy: Fraction of pixels that are within a scale factor of 1.25^3 + 'abs_rel': Absolute relative error + 'rmse': Root mean squared error + 'log_10': Absolute log10 error + 'sq_rel': Squared relative error + 'rmse_log': Root mean squared error on the log scale + 'silog': Scale invariant log error + """ + thresh = np.maximum((gt / pred), (pred / gt)) + a1 = (thresh < 1.25).mean() + a2 = (thresh < 1.25 ** 2).mean() + a3 = (thresh < 1.25 ** 3).mean() + + abs_rel = np.mean(np.abs(gt - pred) / gt) + sq_rel = np.mean(((gt - pred) ** 2) / gt) + + rmse = (gt - pred) ** 2 + rmse = np.sqrt(rmse.mean()) + + rmse_log = (np.log(gt) - np.log(pred)) ** 2 + rmse_log = np.sqrt(rmse_log.mean()) + + err = np.log(pred) - np.log(gt) + silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100 + + log_10 = (np.abs(np.log10(gt) - np.log10(pred))).mean() + return dict(a1=a1, a2=a2, a3=a3, abs_rel=abs_rel, rmse=rmse, log_10=log_10, rmse_log=rmse_log, + silog=silog, sq_rel=sq_rel) + + +def shift_2d_replace(data, dx, dy, constant=False): + shifted_data = np.roll(data, dx, axis=1) + if dx < 0: + shifted_data[:, dx:] = constant + elif dx > 0: + shifted_data[:, 0:dx] = constant + + shifted_data = np.roll(shifted_data, dy, axis=0) + if dy < 0: + shifted_data[dy:, :] = constant + elif dy > 0: + shifted_data[0:dy, :] = constant + return shifted_data + +def soft_edge_error(pred, gt, radius=1): + abs_diff=[] + for i in range(-radius, radius + 1): + for j in range(-radius, radius + 1): + abs_diff.append(np.abs(shift_2d_replace(gt, i, j, 0) - pred)) + return np.minimum.reduce(abs_diff) + +def get_boundaries(disp, th=1., dilation=10): + edges_y = np.logical_or(np.pad(np.abs(disp[1:, :] - disp[:-1, :]) > th, ((1, 0), (0, 0))), + np.pad(np.abs(disp[:-1, :] - disp[1:, :]) > th, ((0, 1), (0, 0)))) + edges_x = np.logical_or(np.pad(np.abs(disp[:, 1:] - disp[:, :-1]) > th, ((0, 0), (1, 0))), + np.pad(np.abs(disp[:, :-1] - disp[:,1:]) > th, ((0, 0), (0, 1)))) + edges = np.logical_or(edges_y, edges_x).astype(np.float32) + + if dilation > 0: + kernel = np.ones((dilation, dilation), np.uint8) + edges = cv2.dilate(edges, kernel, iterations=1) + + return edges + +def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, disp_gt_edges=None, additional_mask=None): + """Compute metrics of predicted depth maps. Applies cropping and masking as necessary or specified via arguments. Refer to compute_errors for more details on metrics. 
+ """ + + if gt.shape[-2:] != pred.shape[-2:] and interpolate: + pred = nn.functional.interpolate( + # pred, gt.shape[-2:], mode='bilinear', align_corners=True).squeeze() + pred, gt.shape[-2:], mode='bilinear', align_corners=False).squeeze() + + pred = pred.squeeze().cpu().numpy() + pred[pred < min_depth_eval] = min_depth_eval + pred[pred > max_depth_eval] = max_depth_eval + pred[np.isinf(pred)] = max_depth_eval + pred[np.isnan(pred)] = min_depth_eval + + gt_depth = gt.squeeze().cpu().numpy() + valid_mask = np.logical_and( + gt_depth > min_depth_eval, gt_depth < max_depth_eval) + + eval_mask = np.ones(valid_mask.shape) + if garg_crop or eigen_crop: + gt_height, gt_width = gt_depth.shape + eval_mask = np.zeros(valid_mask.shape) + + if garg_crop: + eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), + int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1 + + elif eigen_crop: + # print("-"*10, " EIGEN CROP ", "-"*10) + if dataset == 'kitti': + eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), + int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1 + else: + # assert gt_depth.shape == (480, 640), "Error: Eigen crop is currently only valid for (480, 640) images" + eval_mask[45:471, 41:601] = 1 + else: + eval_mask = np.ones(valid_mask.shape) + + valid_mask = np.logical_and(valid_mask, eval_mask) + + # for prompt depth + if additional_mask is not None: + additional_mask = additional_mask.squeeze().detach().cpu().numpy() + valid_mask = np.logical_and(valid_mask, additional_mask) + + metrics = compute_errors(gt_depth[valid_mask], pred[valid_mask]) + + if disp_gt_edges is not None: + + edges = disp_gt_edges.squeeze().numpy() + mask = valid_mask.squeeze() # squeeze + mask = np.logical_and(mask, edges) + + see_depth = torch.tensor([0]) + if mask.sum() > 0: + see_depth_map = soft_edge_error(pred, gt_depth) + see_depth_map_valid = see_depth_map[mask] + see_depth = see_depth_map_valid.mean() + metrics['see'] = see_depth + + return metrics + + +def eps(x): + """Return the `eps` value for the given `input` dtype. (default=float32 ~= 1.19e-7)""" + dtype = torch.float32 if x is None else x.dtype + return torch.finfo(dtype).eps + +def to_log(depth): + """Convert linear depth into log depth.""" + depth = torch.tensor(depth) + depth = (depth > 0) * depth.clamp(min=eps(depth)).log() + return depth + +def to_inv(depth): + """Convert linear depth into disparity.""" + depth = torch.tensor(depth) + disp = (depth > 0) / depth.clamp(min=eps(depth)) + return disp + +def extract_edges(depth, + preprocess=None, + sigma=1, + mask=None, + use_canny=True): + """Detect edges in a dense LiDAR depth map. + + :param depth: (ndarray) (h, w, 1) Dense depth map to extract edges. + :param preprocess: (str) Additional depth map post-processing. (log, inv, none) + :param sigma: (int) Gaussian blurring sigma. + :param mask: (Optional[ndarray]) Optional boolean mask of valid pixels to keep. + :param use_canny: (bool) If `True`, use `Canny` edge detection, otherwise `Sobel`. + :return: (ndarray) (h, w) Detected depth edges in the image. + """ + if preprocess not in {'log', 'inv', 'none', None}: + raise ValueError(f'Invalid depth preprocessing. 
({preprocess})') + + depth = depth.squeeze() + if preprocess == 'log': + depth = to_log(depth) + elif preprocess == 'inv': + depth = to_inv(depth) + depth -= depth.min() + depth /= depth.max() + else: + depth = torch.tensor(depth) + input_value = (depth > 0) * depth.clamp(min=eps(depth)) + # depth = torch.log(input_value) / torch.log(torch.tensor(1.9)) + # depth = torch.log(input_value) / torch.log(torch.tensor(1.9)) + depth = torch.log(input_value) / torch.log(torch.tensor(1.5)) + + depth = depth.numpy() + + if use_canny: + edges = canny(depth, sigma=sigma, mask=mask) + else: + raise NotImplementedError("Sobel edge detection is not implemented yet.") + + return edges diff --git a/estimator/utils/misc.py b/estimator/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..37fd6afd91a83d807fa0a1d6c4fdba93b0541cad --- /dev/null +++ b/estimator/utils/misc.py @@ -0,0 +1,54 @@ + +import random +import numpy +import torch +from typing import List, Optional, Sequence, Tuple, Union +from mmengine.config import ConfigDict +from mmengine.utils.dl_utils import collect_env +from collections import OrderedDict + + +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +def fix_random_seed(seed: int): + + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.deterministic = False # speed up maybe + torch.backends.cudnn.benchmark = True + +def log_env(cfg, env_cfg, runner_info, logger): + """Logging environment information of the current task. + + Args: + env_cfg (dict): The environment config of the runner. + """ + # Collect and log environment information. 
+ env = collect_env() + runtime_env = OrderedDict() + runtime_env.update(env_cfg) + runtime_env['seed'] = runner_info.seed + runtime_env['Distributed launcher'] = runner_info.launcher + runtime_env['Distributed training'] = runner_info.distributed + runtime_env['GPU number'] = runner_info.world_size + + env_info = '\n ' + '\n '.join(f'{k}: {v}' + for k, v in env.items()) + runtime_env_info = '\n ' + '\n '.join( + f'{k}: {v}' for k, v in runtime_env.items()) + dash_line = '-' * 60 + logger.info('\n' + dash_line + '\nSystem environment:' + + env_info + '\n' + '\nRuntime environment:' + runtime_env_info + '\n' + + dash_line + '\n') + + if cfg._cfg_dict: + logger.info(f'Config:\n{cfg.pretty_text}') \ No newline at end of file diff --git a/estimator/utils/runner.py b/estimator/utils/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..da4be84fa453261ad615e14a1661aa6bb1b3d8fd --- /dev/null +++ b/estimator/utils/runner.py @@ -0,0 +1,22 @@ + + +class RunnerInfo: + ''' A dynamic dict saving temp information during running + ''' + def __init__(self): + self._attributes = {} + + def __setattr__(self, name, value): + if name == '_attributes': + super().__setattr__(name, value) + else: + self._attributes[name] = value + + def __getattr__(self, name): + if name in self._attributes: + return self._attributes[name] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + def __repr__(self): + attrs = ''.join(f"\n{key}={value!r}" for key, value in self._attributes.items()) + return f"{type(self).__name__}({attrs})" \ No newline at end of file diff --git a/estimator/utils/type.py b/estimator/utils/type.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc09c7f2530bec67895efefb1af32ab7873305c --- /dev/null +++ b/estimator/utils/type.py @@ -0,0 +1,71 @@ +# Copyright 2023 Toyota Research Institute. All rights reserved. 
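+#
+# Small type-checking predicates shared across the estimator package.
+# Illustrative behaviour of the helpers defined below (not executed here):
+#     is_tensor(torch.zeros(1))  ->  True
+#     is_seq((1, 2))             ->  True
+#     exists(None)               ->  False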
+ +from argparse import Namespace + +import numpy as np +import torch + + +def is_numpy(data): + """Checks if data is a numpy array.""" + return isinstance(data, np.ndarray) + + +def is_tensor(data): + """Checks if data is a torch tensor.""" + return type(data) == torch.Tensor + + +def is_tuple(data): + """Checks if data is a tuple.""" + return isinstance(data, tuple) + + +def is_list(data): + """Checks if data is a list.""" + return isinstance(data, list) or isinstance(data, torch.nn.ModuleList) + + +def is_double_list(data): + """Checks if data is a double list (list of lists)""" + return is_list(data) and len(data) > 0 and is_list(data[0]) + + +def is_dict(data): + """Checks if data is a dictionary.""" + return isinstance(data, dict) or isinstance(data, torch.nn.ModuleDict) + + +def is_module_dict(data): + """Checks if data is a dictionary.""" + return isinstance(data, torch.nn.ModuleDict) + + +def is_str(data): + """Checks if data is a string.""" + return isinstance(data, str) + + +def is_int(data): + """Checks if data is an integer.""" + return isinstance(data, int) + + +def is_seq(data): + """Checks if data is a list or tuple.""" + return is_tuple(data) or is_list(data) + + +def is_double_seq(data): + """Checks if data is a double list (list of lists)""" + return is_seq(data) and len(data) > 0 and is_seq(data[0]) + + +def is_namespace(data): + """Check if data is a Namespace""" + return isinstance(data, Namespace) + + +def exists(data): + """Check if data exists (it is not None)""" + return data is not None \ No newline at end of file diff --git a/examples/1_depth.png b/examples/1_depth.png deleted file mode 100644 index d09e90b0ac670b61b307fde9c598e8c510d36889..0000000000000000000000000000000000000000 --- a/examples/1_depth.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6f3f5b9ae0ae7c01ae16ce445288a8fbde163295103fcdd661b2278c73c45fb -size 1064280 diff --git a/examples/1_gen.png b/examples/1_gen.png deleted file mode 100644 index d5d8bc83527203e6512defdbe5db8d80963e5f6b..0000000000000000000000000000000000000000 --- a/examples/1_gen.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da6b1e5c98e35135dcc270dc2f0c6e82b71921833fde6713c8ccce88235b3ec5 -size 1975389 diff --git a/examples/2_depth.png b/examples/2_depth.png deleted file mode 100644 index 49429a15dca4bf0a91fa866822ced434429ee81f..0000000000000000000000000000000000000000 --- a/examples/2_depth.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ce9252ecd51fda75cad472c5a5929ae8aacfa177545d045157fcdec0ee07c14 -size 1109266 diff --git a/examples/2_gen.png b/examples/2_gen.png deleted file mode 100644 index 8d1a9ad98d8269b2186809988672923389733167..0000000000000000000000000000000000000000 --- a/examples/2_gen.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b06f5359258e08c7f6079c406c4681c6e6e63b19a88b297130fc5362c72520c1 -size 1744771 diff --git a/examples/4_depth.png b/examples/4_depth.png deleted file mode 100644 index f2213da7455eb4ec24843d929b6b48f75fa4688a..0000000000000000000000000000000000000000 --- a/examples/4_depth.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0242a8ed7444750753c9b674ec9e5e2f33fd1af0c37e0378083b92186e00d464 -size 1321726 diff --git a/examples/4_gen.png b/examples/4_gen.png deleted file mode 100644 index c48079aeefdf541da9db168febc98e7dac9b71d5..0000000000000000000000000000000000000000 --- 
a/examples/4_gen.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93fc1719a9a3c2b99bbfdc55ceb6d69a12df50ea73e5dfb542cc8ae5f31d71d0 -size 1486540 diff --git a/examples/example_1.jpeg b/examples/example_1.jpeg index a0e0534032b1301e13e09c22b0824473363e84d0..a192d4ca9875eaa4a4c67459da2e6e9d0dc41bf3 100644 --- a/examples/example_1.jpeg +++ b/examples/example_1.jpeg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1d0a358e28e6d23363688a54e46ca66fb13eea4be58ab071be3a6149b32087a -size 691598 +oid sha256:682ee368ec3420c09d0bd8b3452e7aa0c202c03581312592ae41ed6dad2d2b44 +size 13630302 diff --git a/examples/example_1_coarse.png b/examples/example_1_coarse.png new file mode 100644 index 0000000000000000000000000000000000000000..c152192d1057260c4fca6bfdfc177b65df76e575 --- /dev/null +++ b/examples/example_1_coarse.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c79f44e0e697b024e1f7965f7a532e35d8f726958b521a22d5178b5564b1aa1 +size 377683 diff --git a/examples/example_1_fine.png b/examples/example_1_fine.png new file mode 100644 index 0000000000000000000000000000000000000000..68395be1313217abeacd072d9ed2798fa1e0eb3f --- /dev/null +++ b/examples/example_1_fine.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b7f711904ce789010f6ce6941881986adad10d2f085d414155fe2a6e3e80713 +size 2251860 diff --git a/examples/example_2.jpeg b/examples/example_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..43b0731b20a7c1093e006d6282c3cccb976d23fd --- /dev/null +++ b/examples/example_2.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a7d175bf88d61d641b9b593873c150651bd9aaf419769901bee2a88d86147e +size 2994014 diff --git a/examples/example_2_coarse.png b/examples/example_2_coarse.png new file mode 100644 index 0000000000000000000000000000000000000000..e3314c046e125a648fb3c590b2aa24be7ecec148 --- /dev/null +++ b/examples/example_2_coarse.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ada4b81395c7f09b9a9035eef85f8faa6ca07a440d5e31a9a7cfaccfeaf355 +size 431907 diff --git a/examples/example_2_fine.png b/examples/example_2_fine.png new file mode 100644 index 0000000000000000000000000000000000000000..0d23fc3d37d89259c6a599c22cfe8decc63250e2 --- /dev/null +++ b/examples/example_2_fine.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e010e13ee62e9fec5d0e4e6004c0003d0d17f6f894fdf57a168d68eca0662a54 +size 4650200 diff --git a/examples/example_3.jpeg b/examples/example_3.jpeg index ab30a826ddd72d067b558dc94926c7548b50143c..3cdeaffb2b3ff9d7691ff30e2fb826719348a60d 100644 --- a/examples/example_3.jpeg +++ b/examples/example_3.jpeg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18970a06513eb394b2f8c74b330efa7e19416468a6802eb6898b4e7d6dd29cd8 -size 5590293 +oid sha256:786c0b925f85f1cbe382215a497597445132db96a1faf219d3837c595ac10ea4 +size 2371789 diff --git a/examples/example_3_coarse.png b/examples/example_3_coarse.png new file mode 100644 index 0000000000000000000000000000000000000000..cc97485151ed916c5557e99a906fcb3105c3454b --- /dev/null +++ b/examples/example_3_coarse.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748fe9681ff2c3ce88fc0061fc5f9c1e1066f2db0f481bbdb2c46297960277d7 +size 242156 diff --git a/examples/example_3_fine.png b/examples/example_3_fine.png new file mode 100644 index 
0000000000000000000000000000000000000000..f58a9952b4a9c684033949037a5f4f4542d1802e --- /dev/null +++ b/examples/example_3_fine.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e38b742c9d439599039d285979c7317478cef8f0aa95d6ec9912d661484222d6 +size 1796781 diff --git a/examples/example_4.jpeg b/examples/example_4.jpeg deleted file mode 100644 index 92f61f2a9002943b34499c9676ec873737e21685..0000000000000000000000000000000000000000 --- a/examples/example_4.jpeg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4271093e47346fd6dff0ab179acd90cba7a1c1e2d5f171ee110a99d172f71d0 -size 730371 diff --git a/examples/example_6.png b/examples/example_6.png deleted file mode 100644 index a820db3384a1d6b7282f88ccb647491d3eaa24ad..0000000000000000000000000000000000000000 --- a/examples/example_6.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c5ef3c57fc74b7f9cea4ca6075fac88d72b804d076f39bb6f9ee42edf92168a -size 7582237 diff --git a/infer_user.py b/infer_user.py deleted file mode 100644 index 71504603843aaf447b825718b585531eeb722a82..0000000000000000000000000000000000000000 --- a/infer_user.py +++ /dev/null @@ -1,1059 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Zhenyu Li - -import os -import cv2 -import argparse -from zoedepth.utils.config import get_config_user -from zoedepth.models.builder import build_model -from zoedepth.utils.arg_utils import parse_unknown -import numpy as np -from zoedepth.models.base_models.midas import Resize -from torchvision.transforms import Compose -from torchvision.transforms import Normalize -import torch -import torch.nn as nn -import matplotlib.pyplot as plt -import copy -from zoedepth.utils.misc import get_boundaries -from zoedepth.utils.misc import compute_metrics, RunningAverageDict -from tqdm import tqdm -import matplotlib -import torch.nn.functional as F -from zoedepth.data.middleburry import readPFM -import random -import imageio -from PIL import Image - -def load_state_dict(model, state_dict): - """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. - - DataParallel prefixes state_dict keys with 'module.' when saving. - If the model is not a DataParallel model but the state_dict is, then prefixes are removed. 
- If the model is a DataParallel model but the state_dict is not, then prefixes are added. - """ - state_dict = state_dict.get('model', state_dict) - # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' - - do_prefix = isinstance( - model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) - state = {} - for k, v in state_dict.items(): - if k.startswith('module.') and not do_prefix: - k = k[7:] - - if not k.startswith('module.') and do_prefix: - k = 'module.' + k - - state[k] = v - - model.load_state_dict(state, strict=True) - # model.load_state_dict(state, strict=False) - print("Loaded successfully") - return model - - -def load_wts(model, checkpoint_path): - ckpt = torch.load(checkpoint_path, map_location='cpu') - return load_state_dict(model, ckpt) - -def load_ckpt(model, checkpoint): - model = load_wts(model, checkpoint) - print("Loaded weights from {0}".format(checkpoint)) - return model - - -#### def dataset -def read_image(path, dataset_name): - if dataset_name == 'u4k': - img = np.fromfile(open(path, 'rb'), dtype=np.uint8).reshape(2160, 3840, 3) / 255.0 - img = img.astype(np.float32)[:, :, ::-1].copy() - elif dataset_name == 'mid': - img = cv2.imread(path) - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 - - img = F.interpolate(torch.tensor(img).unsqueeze(dim=0).permute(0, 3, 1, 2), IMG_RESOLUTION, mode='bicubic', align_corners=True) - img = img.squeeze().permute(1, 2, 0) - - elif dataset_name == 'nyu': - img = Image.open(path) - img = np.asarray(img, dtype=np.float32) / 255.0 - - else: - img = cv2.imread(path) - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 - print(img.shape) - img = F.interpolate(torch.tensor(img).unsqueeze(dim=0).permute(0, 3, 1, 2), IMG_RESOLUTION, mode='bicubic', align_corners=True) - img = img.squeeze().permute(1, 2, 0) - return img - -class Images: - def __init__(self, root_dir, files, index, dataset_name=None): - self.root_dir = root_dir - name = files[index] - self.dataset_name = dataset_name - self.rgb_image = read_image(os.path.join(self.root_dir, name), dataset_name) - name = name.replace(".jpg", "") - name = name.replace(".png", "") - name = name.replace(".jpeg", "") - self.name = name - -class DepthMap: - def __init__(self, root_dir, files, index, dataset_name, pred=False): - self.root_dir = root_dir - name = files[index] - - gt_path = os.path.join(self.root_dir, name) - if dataset_name == 'u4k': - depth_factor = gt_path.replace('val_gt', 'val_factor') - depth_factor = depth_factor.replace('.npy', '.txt') - with open(depth_factor, 'r') as f: - df = f.readline() - df = float(df) - - gt_disp = np.load(gt_path, mmap_mode='c') - gt_disp = gt_disp.astype(np.float32) - edges = get_boundaries(gt_disp, th=1, dilation=0) - - gt_depth = df/gt_disp - self.gt = gt_depth - self.edge = edges - - elif dataset_name == 'gta': - gt_depth = imageio.imread(gt_path) - gt_depth = np.array(gt_depth).astype(np.float32) / 256 - edges = get_boundaries(gt_depth, th=1, dilation=0) - self.gt = gt_depth - self.edge = edges - - elif dataset_name == 'mid': - - depth_factor = gt_path.replace('gts', 'calibs') - depth_factor = depth_factor.replace('.pfm', '.txt') - with open(depth_factor, 'r') as f: - ext_l = f.readlines() - cam_info = ext_l[0].strip() - cam_info_f = float(cam_info.split(' ')[0].split('[')[1]) - base = float(ext_l[3].strip().split('=')[1]) - doffs = 
float(ext_l[2].strip().split('=')[1]) - depth_factor = base * cam_info_f - - height = 1840 - width = 2300 - - disp_gt, scale = readPFM(gt_path) - disp_gt = disp_gt.astype(np.float32) - - disp_gt_copy = disp_gt.copy() - disp_gt = disp_gt - invalid_mask = disp_gt == np.inf - - depth_gt = depth_factor / (disp_gt + doffs) - depth_gt = depth_gt / 1000 - depth_gt[invalid_mask] = 0 # set to a invalid number - disp_gt_copy[invalid_mask] = 0 - edges = get_boundaries(disp_gt_copy, th=1, dilation=0) - - self.gt = depth_gt - self.edge = edges - - elif dataset_name == 'nyu': - if pred: - depth_gt = np.load(gt_path.replace('png', 'npy')) - depth_gt = nn.functional.interpolate( - torch.tensor(depth_gt).unsqueeze(dim=0).unsqueeze(dim=0), (480, 640), mode='bilinear', align_corners=True).squeeze().numpy() - - edges = get_boundaries(depth_gt, th=1, dilation=0) - else: - depth_gt = np.asarray(Image.open(gt_path), dtype=np.float32) / 1000 - edges = get_boundaries(depth_gt, th=1, dilation=0) - self.gt = depth_gt - self.edge = edges - - - else: - raise NotImplementedError - - name = name.replace(".npy", "") # u4k - name = name.replace(".exr", "") # gta - self.name = name - -class ImageDataset: - def __init__(self, rgb_image_dir, gt_dir=None, dataset_name=''): - self.rgb_image_dir = rgb_image_dir - self.files = sorted(os.listdir(self.rgb_image_dir)) - self.gt_dir = gt_dir - self.dataset_name = dataset_name - - if gt_dir is not None: - self.gt_dir = gt_dir - self.gt_files = sorted(os.listdir(self.gt_dir)) - - def __len__(self): - return len(self.files) - - def __getitem__(self, index): - if self.dataset_name == 'nyu': - return Images(self.rgb_image_dir, self.files, index, self.dataset_name), DepthMap(self.gt_dir, self.gt_files, index, self.dataset_name), DepthMap('/ibex/ai/home/liz0l/codes/ZoeDepth/nfs/save/nyu', self.gt_files, index, self.dataset_name, pred=True) - if self.gt_dir is not None: - return Images(self.rgb_image_dir, self.files, index, self.dataset_name), DepthMap(self.gt_dir, self.gt_files, index, self.dataset_name) - else: - return Images(self.rgb_image_dir, self.files, index) - -def crop(img, crop_bbox): - crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox - templete = torch.zeros((1, 1, img.shape[-2], img.shape[-1]), dtype=torch.float) - templete[:, :, crop_y1:crop_y2, crop_x1:crop_x2] = 1.0 - img = img[:, :, crop_y1:crop_y2, crop_x1:crop_x2] - return img, templete - -# def generatemask(size): -# # Generates a Guassian mask -# mask = np.zeros(size, dtype=np.float32) -# sigma = int(size[0]/16) -# k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1) -# mask[int(0.15*size[0]):size[0] - int(0.15*size[0]), int(0.15*size[1]): size[1] - int(0.15*size[1])] = 1 -# mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) -# mask = (mask - mask.min()) / (mask.max() - mask.min()) -# mask = mask.astype(np.float32) -# return mask - -def generatemask(size): - # Generates a Guassian mask - mask = np.zeros(size, dtype=np.float32) - sigma = int(size[0]/16) - k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1) - mask[int(0.1*size[0]):size[0] - int(0.1*size[0]), int(0.1*size[1]): size[1] - int(0.1*size[1])] = 1 - mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) - mask = (mask - mask.min()) / (mask.max() - mask.min()) - mask = mask.astype(np.float32) - return mask - - -def generatemask_coarse(size): - # Generates a Guassian mask - mask = np.zeros(size, dtype=np.float32) - sigma = int(size[0]/64) - k_size = int(2 * np.ceil(2 * int(size[0]/64)) + 1) - mask[int(0.001*size[0]):size[0] - int(0.001*size[0]), 
int(0.001*size[1]): size[1] - int(0.001*size[1])] = 1 - mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) - mask = (mask - mask.min()) / (mask.max() - mask.min()) - mask = mask.astype(np.float32) - return mask - -class RunningAverageMap: - """A dictionary of running averages.""" - def __init__(self, average_map, count_map): - self.average_map = average_map - self.count_map = count_map - self.average_map = self.average_map / self.count_map - - def update(self, pred_map, ct_map): - self.average_map = (pred_map + self.count_map * self.average_map) / (self.count_map + ct_map) - self.count_map = self.count_map + ct_map - -# default size [540, 960] -# x_start, y_start = [0, 540, 1080, 1620], [0, 960, 1920, 2880] -def regular_tile(model, image, offset_x=0, offset_y=0, img_lr=None, iter_pred=None, boundary=0, update=False, avg_depth_map=None, blr_mask=False): - # crop size - # height = 540 - # width = 960 - height = CROP_SIZE[0] - width = CROP_SIZE[1] - - assert offset_x >= 0 and offset_y >= 0 - - tile_num_x = (IMG_RESOLUTION[1] - offset_x) // width - tile_num_y = (IMG_RESOLUTION[0] - offset_y) // height - x_start = [width * x + offset_x for x in range(tile_num_x)] - y_start = [height * y + offset_y for y in range(tile_num_y)] - imgs_crop = [] - crop_areas = [] - bboxs_roi = [] - bboxs_raw = [] - - if iter_pred is not None: - iter_pred = iter_pred.unsqueeze(dim=0).unsqueeze(dim=0) - - iter_priors = [] - for x in x_start: # w - for y in y_start: # h - bbox = (int(y), int(y+height), int(x), int(x+width)) - img_crop, crop_area = crop(image, bbox) - imgs_crop.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / IMG_RESOLUTION[1] * 512, crop_y1 / IMG_RESOLUTION[0] * 384, crop_x2 / IMG_RESOLUTION[1] * 512, crop_y2 / IMG_RESOLUTION[0] * 384]) - bboxs_roi.append(bbox_roi) - bbox_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw.append(bbox_raw) - - if iter_pred is not None: - iter_prior, _ = crop(iter_pred, bbox) - iter_priors.append(iter_prior) - - crop_areas = torch.cat(crop_areas, dim=0) - imgs_crop = torch.cat(imgs_crop, dim=0) - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw, dim=0) - - if iter_pred is not None: - iter_priors = torch.cat(iter_priors, dim=0) - iter_priors = TRANSFORM(iter_priors) - iter_priors = iter_priors.cuda().float() - - crop_areas = TRANSFORM(crop_areas) - imgs_crop = TRANSFORM(imgs_crop) - - imgs_crop = imgs_crop.cuda().float() - bboxs_roi = bboxs_roi.cuda().float() - crop_areas = crop_areas.cuda().float() - img_lr = img_lr.cuda().float() - - pred_depth_crops = [] - with torch.no_grad(): - for i, (img, bbox, crop_area) in enumerate(zip(imgs_crop, bboxs_roi, crop_areas)): - - if iter_pred is not None: - iter_prior = iter_priors[i].unsqueeze(dim=0) - else: - iter_prior = None - - if i == 0: - out_dict = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None) - whole_depth_pred = out_dict['coarse_depth_pred'] - # return whole_depth_pred.squeeze() - # pred_depth_crop = out_dict['fine_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - else: - pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['metric_depth'] - # pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', 
image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['fine_depth_pred'] - - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (height, width), mode='bilinear', align_corners=True) - # pred_depth_crop = nn.functional.interpolate( - # pred_depth_crop, (height, width), mode='nearest') - pred_depth_crops.append(pred_depth_crop.squeeze()) - - whole_depth_pred = whole_depth_pred.squeeze() - whole_depth_pred = nn.functional.interpolate(whole_depth_pred.unsqueeze(dim=0).unsqueeze(dim=0), IMG_RESOLUTION, mode='bilinear', align_corners=True).squeeze() - - ####### stich part - inner_idx = 0 - init_flag = False - if offset_x == 0 and offset_y == 0: - init_flag = True - # pred_depth = whole_depth_pred - pred_depth = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - else: - iter_pred = iter_pred.squeeze() - pred_depth = iter_pred - - blur_mask = generatemask((height, width)) + 1e-3 - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if init_flag: - # pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx] + (1 - blur_mask) * crop_temp - # pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx] + (1 - blur_mask) * crop_temp - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map[y: y+height, x: x+width] = blur_mask - pred_depth[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - - else: - # ensemble with running mean - if blr_mask: - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = blur_mask - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - avg_depth_map.update(pred_map, count_map) - else: - if boundary != 0: - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = 1 - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = pred_depth_crops[inner_idx][boundary:-boundary, boundary:-boundary] - avg_depth_map.update(pred_map, count_map) - else: - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = 1 - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] - avg_depth_map.update(pred_map, count_map) - - - inner_idx += 1 - - if init_flag: - avg_depth_map = RunningAverageMap(pred_depth, count_map) - # blur_mask = generatemask_coarse(IMG_RESOLUTION) - # blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - # count_map = (1 - blur_mask) - # pred_map = whole_depth_pred * (1 - blur_mask) - # avg_depth_map.update(pred_map, count_map) - return avg_depth_map - -def regular_tile_param(model, image, offset_x=0, offset_y=0, img_lr=None, iter_pred=None, boundary=0, update=False, avg_depth_map=None, blr_mask=False, crop_size=None, - img_resolution=None, transform=None): - # crop size - # height = 540 - # width = 960 - height = crop_size[0] - width = crop_size[1] - - assert offset_x >= 0 
and offset_y >= 0 - - tile_num_x = (img_resolution[1] - offset_x) // width - tile_num_y = (img_resolution[0] - offset_y) // height - x_start = [width * x + offset_x for x in range(tile_num_x)] - y_start = [height * y + offset_y for y in range(tile_num_y)] - imgs_crop = [] - crop_areas = [] - bboxs_roi = [] - bboxs_raw = [] - - if iter_pred is not None: - iter_pred = iter_pred.unsqueeze(dim=0).unsqueeze(dim=0) - - iter_priors = [] - for x in x_start: # w - for y in y_start: # h - bbox = (int(y), int(y+height), int(x), int(x+width)) - img_crop, crop_area = crop(image, bbox) - imgs_crop.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / img_resolution[1] * 512, crop_y1 / img_resolution[0] * 384, crop_x2 / img_resolution[1] * 512, crop_y2 / img_resolution[0] * 384]) - bboxs_roi.append(bbox_roi) - bbox_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw.append(bbox_raw) - - if iter_pred is not None: - iter_prior, _ = crop(iter_pred, bbox) - iter_priors.append(iter_prior) - - crop_areas = torch.cat(crop_areas, dim=0) - imgs_crop = torch.cat(imgs_crop, dim=0) - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw, dim=0) - - if iter_pred is not None: - iter_priors = torch.cat(iter_priors, dim=0) - iter_priors = transform(iter_priors) - iter_priors = iter_priors.to(image.device).float() - - crop_areas = transform(crop_areas) - imgs_crop = transform(imgs_crop) - - imgs_crop = imgs_crop.to(image.device).float() - bboxs_roi = bboxs_roi.to(image.device).float() - crop_areas = crop_areas.to(image.device).float() - img_lr = img_lr.to(image.device).float() - - pred_depth_crops = [] - with torch.no_grad(): - for i, (img, bbox, crop_area) in enumerate(zip(imgs_crop, bboxs_roi, crop_areas)): - - if iter_pred is not None: - iter_prior = iter_priors[i].unsqueeze(dim=0) - else: - iter_prior = None - - if i == 0: - out_dict = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None) - whole_depth_pred = out_dict['coarse_depth_pred'] - # return whole_depth_pred.squeeze() - # pred_depth_crop = out_dict['fine_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - else: - pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['metric_depth'] - # pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['fine_depth_pred'] - - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (height, width), mode='bilinear', align_corners=True) - # pred_depth_crop = nn.functional.interpolate( - # pred_depth_crop, (height, width), mode='nearest') - pred_depth_crops.append(pred_depth_crop.squeeze()) - - whole_depth_pred = whole_depth_pred.squeeze() - whole_depth_pred = nn.functional.interpolate(whole_depth_pred.unsqueeze(dim=0).unsqueeze(dim=0), img_resolution, mode='bilinear', align_corners=True).squeeze() - - ####### stich part - inner_idx = 0 - init_flag = False - if offset_x == 0 and offset_y == 0: - init_flag = True - # pred_depth = whole_depth_pred - pred_depth = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - else: - iter_pred = iter_pred.squeeze() - pred_depth = iter_pred - - 
blur_mask = generatemask((height, width)) + 1e-3 - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if init_flag: - # pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx] + (1 - blur_mask) * crop_temp - # pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx] + (1 - blur_mask) * crop_temp - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map[y: y+height, x: x+width] = blur_mask - pred_depth[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - - else: - # ensemble with running mean - if blr_mask: - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = blur_mask - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - avg_depth_map.update(pred_map, count_map) - else: - if boundary != 0: - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = 1 - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = pred_depth_crops[inner_idx][boundary:-boundary, boundary:-boundary] - avg_depth_map.update(pred_map, count_map) - else: - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = 1 - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] - avg_depth_map.update(pred_map, count_map) - - - inner_idx += 1 - - if init_flag: - avg_depth_map = RunningAverageMap(pred_depth, count_map) - # blur_mask = generatemask_coarse(img_resolution) - # blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - # count_map = (1 - blur_mask) - # pred_map = whole_depth_pred * (1 - blur_mask) - # avg_depth_map.update(pred_map, count_map) - return avg_depth_map - -def random_tile(model, image, img_lr=None, iter_pred=None, boundary=0, update=False, avg_depth_map=None, blr_mask=False): - height = CROP_SIZE[0] - width = CROP_SIZE[1] - - - x_start = [random.randint(0, IMG_RESOLUTION[1] - width - 1)] - y_start = [random.randint(0, IMG_RESOLUTION[0] - height - 1)] - - imgs_crop = [] - crop_areas = [] - bboxs_roi = [] - bboxs_raw = [] - - if iter_pred is not None: - iter_pred = iter_pred.unsqueeze(dim=0).unsqueeze(dim=0) - - iter_priors = [] - for x in x_start: # w - for y in y_start: # h - bbox = (int(y), int(y+height), int(x), int(x+width)) - img_crop, crop_area = crop(image, bbox) - imgs_crop.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / IMG_RESOLUTION[1] * 512, crop_y1 / IMG_RESOLUTION[0] * 384, crop_x2 / IMG_RESOLUTION[1] * 512, crop_y2 / IMG_RESOLUTION[0] * 384]) - bboxs_roi.append(bbox_roi) - bbox_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw.append(bbox_raw) - - if iter_pred is not None: - iter_prior, _ = crop(iter_pred, bbox) - iter_priors.append(iter_prior) - - crop_areas = torch.cat(crop_areas, dim=0) - imgs_crop = torch.cat(imgs_crop, dim=0) - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = 
torch.stack(bboxs_raw, dim=0) - - if iter_pred is not None: - iter_priors = torch.cat(iter_priors, dim=0) - iter_priors = TRANSFORM(iter_priors) - iter_priors = iter_priors.cuda().float() - - crop_areas = TRANSFORM(crop_areas) - imgs_crop = TRANSFORM(imgs_crop) - - - - imgs_crop = imgs_crop.cuda().float() - bboxs_roi = bboxs_roi.cuda().float() - crop_areas = crop_areas.cuda().float() - img_lr = img_lr.cuda().float() - - pred_depth_crops = [] - with torch.no_grad(): - for i, (img, bbox, crop_area) in enumerate(zip(imgs_crop, bboxs_roi, crop_areas)): - - if iter_pred is not None: - iter_prior = iter_priors[i].unsqueeze(dim=0) - else: - iter_prior = None - - if i == 0: - out_dict = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None) - whole_depth_pred = out_dict['coarse_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - # return whole_depth_pred.squeeze() - else: - pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['metric_depth'] - - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (height, width), mode='bilinear', align_corners=True) - # pred_depth_crop = nn.functional.interpolate( - # pred_depth_crop, (height, width), mode='nearest') - pred_depth_crops.append(pred_depth_crop.squeeze()) - - whole_depth_pred = whole_depth_pred.squeeze() - - ####### stich part - inner_idx = 0 - init_flag = False - iter_pred = iter_pred.squeeze() - pred_depth = iter_pred - - blur_mask = generatemask((height, width)) + 1e-3 - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if init_flag: - # wont be here - crop_temp = copy.deepcopy(whole_depth_pred[y: y+height, x: x+width]) - blur_mask = torch.ones((height, width)) - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx]+ (1 - blur_mask) * crop_temp - else: - if blr_mask: - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = blur_mask - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - avg_depth_map.update(pred_map, count_map) - else: - # ensemble with running mean - if boundary != 0: - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = 1 - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = pred_depth_crops[inner_idx][boundary:-boundary, boundary:-boundary] - avg_depth_map.update(pred_map, count_map) - else: - count_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = 1 - pred_map = torch.zeros(IMG_RESOLUTION, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] - avg_depth_map.update(pred_map, count_map) - - inner_idx += 1 - - if avg_depth_map is None: - return pred_depth - - -def random_tile_param(model, image, img_lr=None, iter_pred=None, boundary=0, update=False, avg_depth_map=None, 
blr_mask=False, crop_size=None, - img_resolution=None, transform=None): - height = crop_size[0] - width = crop_size[1] - - - x_start = [random.randint(0, img_resolution[1] - width - 1)] - y_start = [random.randint(0, img_resolution[0] - height - 1)] - - imgs_crop = [] - crop_areas = [] - bboxs_roi = [] - bboxs_raw = [] - - if iter_pred is not None: - iter_pred = iter_pred.unsqueeze(dim=0).unsqueeze(dim=0) - - iter_priors = [] - for x in x_start: # w - for y in y_start: # h - bbox = (int(y), int(y+height), int(x), int(x+width)) - img_crop, crop_area = crop(image, bbox) - imgs_crop.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / img_resolution[1] * 512, crop_y1 / img_resolution[0] * 384, crop_x2 / img_resolution[1] * 512, crop_y2 / img_resolution[0] * 384]) - bboxs_roi.append(bbox_roi) - bbox_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw.append(bbox_raw) - - if iter_pred is not None: - iter_prior, _ = crop(iter_pred, bbox) - iter_priors.append(iter_prior) - - crop_areas = torch.cat(crop_areas, dim=0) - imgs_crop = torch.cat(imgs_crop, dim=0) - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw, dim=0) - - if iter_pred is not None: - iter_priors = torch.cat(iter_priors, dim=0) - iter_priors = transform(iter_priors) - iter_priors = iter_priors.cuda().float() - - crop_areas = transform(crop_areas) - imgs_crop = transform(imgs_crop) - - imgs_crop = imgs_crop.cuda().float() - bboxs_roi = bboxs_roi.cuda().float() - crop_areas = crop_areas.cuda().float() - img_lr = img_lr.cuda().float() - - pred_depth_crops = [] - with torch.no_grad(): - for i, (img, bbox, crop_area) in enumerate(zip(imgs_crop, bboxs_roi, crop_areas)): - - if iter_pred is not None: - iter_prior = iter_priors[i].unsqueeze(dim=0) - else: - iter_prior = None - - if i == 0: - out_dict = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None) - whole_depth_pred = out_dict['coarse_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - # return whole_depth_pred.squeeze() - else: - pred_depth_crop = model(img.unsqueeze(dim=0), mode='eval', image_raw=img_lr, bbox=bbox.unsqueeze(dim=0), crop_area=crop_area.unsqueeze(dim=0), iter_prior=iter_prior if update is True else None)['metric_depth'] - - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (height, width), mode='bilinear', align_corners=True) - # pred_depth_crop = nn.functional.interpolate( - # pred_depth_crop, (height, width), mode='nearest') - pred_depth_crops.append(pred_depth_crop.squeeze()) - - whole_depth_pred = whole_depth_pred.squeeze() - - ####### stich part - inner_idx = 0 - init_flag = False - iter_pred = iter_pred.squeeze() - pred_depth = iter_pred - - blur_mask = generatemask((height, width)) + 1e-3 - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if init_flag: - # wont be here - crop_temp = copy.deepcopy(whole_depth_pred[y: y+height, x: x+width]) - blur_mask = torch.ones((height, width)) - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - pred_depth[y: y+height, x: x+width] = blur_mask * pred_depth_crops[inner_idx]+ (1 - blur_mask) * crop_temp - else: - - if blr_mask: - blur_mask = torch.tensor(blur_mask, device=whole_depth_pred.device) - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = 
blur_mask - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] * blur_mask - avg_depth_map.update(pred_map, count_map) - else: - # ensemble with running mean - if boundary != 0: - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = 1 - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y+boundary: y+height-boundary, x+boundary: x+width-boundary] = pred_depth_crops[inner_idx][boundary:-boundary, boundary:-boundary] - avg_depth_map.update(pred_map, count_map) - else: - count_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - count_map[y: y+height, x: x+width] = 1 - pred_map = torch.zeros(img_resolution, device=pred_depth_crops[inner_idx].device) - pred_map[y: y+height, x: x+width] = pred_depth_crops[inner_idx] - avg_depth_map.update(pred_map, count_map) - - inner_idx += 1 - - if avg_depth_map is None: - return pred_depth - - -def colorize_infer(value, cmap='magma_r', vmin=None, vmax=None): - # normalize - vmin = value.min() if vmin is None else vmin - # vmax = value.max() if vmax is None else vmax - vmax = np.percentile(value, 95) if vmax is None else vmax - - if vmin != vmax: - value = (value - vmin) / (vmax - vmin) # vmin..vmax - else: - value = value * 0. - - cmapper = matplotlib.cm.get_cmap(cmap) - value = cmapper(value, bytes=True) # ((1)xhxwx4) - - value = value[:, :, :3] # bgr -> rgb - rgb_value = value[..., ::-1] - - return rgb_value - - -def colorize(value, vmin=None, vmax=None, cmap='turbo_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None, dataset_name=None): - """Converts a depth map to a color image. - - Args: - value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed - vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. - vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. - cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. - invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. - invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. - background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). - gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. - value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. - - Returns: - numpy.ndarray, dtype - uint8: Colored depth map. 
Shape: (H, W, 4) - """ - if isinstance(value, torch.Tensor): - value = value.detach().cpu().numpy() - - value = value.squeeze() - if invalid_mask is None: - invalid_mask = value == invalid_val - mask = np.logical_not(invalid_mask) - - # normalize - # vmin = np.percentile(value[mask],2) if vmin is None else vmin - # vmin = value.min() if vmin is None else vmin - # vmax = np.percentile(value[mask],95) if vmax is None else vmax - - # mid gt - if dataset_name == 'mid': - vmin = np.percentile(value[mask],2) if vmin is None else vmin - vmax = np.percentile(value[mask],85) if vmax is None else vmax - else: - vmin = value.min() if vmin is None else vmin - vmax = np.percentile(value[mask],95) if vmax is None else vmax - - if vmin != vmax: - value = (value - vmin) / (vmax - vmin) # vmin..vmax - else: - # Avoid 0-division - value = value * 0. - - # squeeze last dim if it exists - # grey out the invalid values - - value[invalid_mask] = np.nan - cmapper = matplotlib.cm.get_cmap(cmap) - if value_transform: - value = value_transform(value) - # value = value / value.max() - value = cmapper(value, bytes=True) # (nxmx4) - - # img = value[:, :, :] - img = value[...] - img[invalid_mask] = background_color - - # return img.transpose((2, 0, 1)) - if gamma_corrected: - # gamma correction - img = img / 255 - img = np.power(img, 2.2) - img = img * 255 - img = img.astype(np.uint8) - return img - -def rescale(A, lbound=0, ubound=1): - """ - Rescale an array to [lbound, ubound]. - - Parameters: - - A: Input data as numpy array - - lbound: Lower bound of the scale, default is 0. - - ubound: Upper bound of the scale, default is 1. - - Returns: - - Rescaled array - """ - A_min = np.min(A) - A_max = np.max(A) - return (ubound - lbound) * (A - A_min) / (A_max - A_min) + lbound - -def run(model, dataset, gt_dir=None, show_path=None, show=False, save_flag=False, save_path=None, mode=None, dataset_name=None, base_zoed=False, blr_mask=False): - data_len = len(dataset) - - if gt_dir is not None: - metrics_avg = RunningAverageDict() - - for image_ind in tqdm(range(data_len)): - if dataset_name == 'nyu': - images, depths, pred_depths = dataset[image_ind] - else: - if gt_dir is None: - images = dataset[image_ind] - else: - images, depths = dataset[image_ind] - - # Load image from dataset - img = torch.tensor(images.rgb_image).unsqueeze(dim=0).permute(0, 3, 1, 2) # shape: 1, 3, h, w - img_lr = TRANSFORM(img) - - - if base_zoed: - with torch.no_grad(): - pred_depth = model(img.cuda())['metric_depth'].squeeze() - avg_depth_map = RunningAverageMap(pred_depth) - - else: - # pred_depth, count_map = regular_tile(model, img, offset_x=0, offset_y=0, img_lr=img_lr) - # avg_depth_map = RunningAverageMap(pred_depth, count_map) - avg_depth_map = regular_tile(model, img, offset_x=0, offset_y=0, img_lr=img_lr) - - if mode== 'p16': - pass - elif mode== 'p49': - regular_tile(model, img, offset_x=CROP_SIZE[1]//2, offset_y=0, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - regular_tile(model, img, offset_x=0, offset_y=CROP_SIZE[0]//2, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - regular_tile(model, img, offset_x=CROP_SIZE[1]//2, offset_y=CROP_SIZE[0]//2, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - - elif mode[0] == 'r': - regular_tile(model, img, offset_x=CROP_SIZE[1]//2, offset_y=0, 
img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - regular_tile(model, img, offset_x=0, offset_y=CROP_SIZE[0]//2, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - regular_tile(model, img, offset_x=CROP_SIZE[1]//2, offset_y=CROP_SIZE[0]//2, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - - for i in tqdm(range(int(mode[1:]))): - random_tile(model, img, img_lr=img_lr, iter_pred=avg_depth_map.average_map, boundary=BOUNDARY, update=True, avg_depth_map=avg_depth_map, blr_mask=blr_mask) - - if show: - color_map = copy.deepcopy(avg_depth_map.average_map) - color_map = colorize_infer(color_map.detach().cpu().numpy()) - cv2.imwrite(os.path.join(show_path, '{}.png'.format(images.name)), color_map) - - if save_flag: - np.save(os.path.join(save_path, '{}.npy'.format(images.name)), avg_depth_map.average_map.squeeze().detach().cpu().numpy()) - # np.save(os.path.join(save_path, '{}.npy'.format(images.name)), depths.gt) - - if gt_dir is not None: - if dataset_name == 'nyu': - metrics = compute_metrics(torch.tensor(depths.gt), avg_depth_map.average_map, disp_gt_edges=depths.edge, min_depth_eval=1e-3, max_depth_eval=10, garg_crop=False, eigen_crop=True, dataset='nyu', pred_depths=torch.tensor(pred_depths.gt)) - # metrics = compute_metrics(torch.tensor(depths.gt), avg_depth_map.average_map, disp_gt_edges=depths.edge, min_depth_eval=1e-3, max_depth_eval=10, garg_crop=False, eigen_crop=True, dataset='nyu') - else: - metrics = compute_metrics(torch.tensor(depths.gt), avg_depth_map.average_map, disp_gt_edges=depths.edge, min_depth_eval=1e-3, max_depth_eval=80, garg_crop=False, eigen_crop=False, dataset='') - metrics_avg.update(metrics) - print(metrics) - - if gt_dir is not None: - print(metrics_avg.get_value()) - else: - print("successful!") - return avg_depth_map - -#### -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--rgb_dir', type=str, required=True) - parser.add_argument('--show_path', type=str, required=None) - parser.add_argument("--ckp_path", type=str, required=True) - parser.add_argument("-m", "--model", type=str, default="zoedepth") - parser.add_argument("--model_cfg_path", type=str, default="") - parser.add_argument("--gt_dir", type=str, default=None) - parser.add_argument("--dataset_name", type=str, default=None) - parser.add_argument("--show", action='store_true') - parser.add_argument("--save", action='store_true') - parser.add_argument("--save_path", type=str, default=None) - parser.add_argument("--img_resolution", type=str, default=None) - parser.add_argument("--crop_size", type=str, default=None) - parser.add_argument("--mode", type=str, default=None) - parser.add_argument("--base_zoed", action='store_true') - parser.add_argument("--boundary", type=int, default=0) - parser.add_argument("--blur_mask", action='store_true') - args, unknown_args = parser.parse_known_args() - - # prepare some global args - global IMG_RESOLUTION - if args.dataset_name == 'u4k': - IMG_RESOLUTION = (2160, 3840) - elif args.dataset_name == 'gta': - IMG_RESOLUTION = (1080, 1920) - elif args.dataset_name == 'nyu': - IMG_RESOLUTION = (480, 640) - else: - IMG_RESOLUTION = (2160, 3840) - - global TRANSFORM - TRANSFORM = Compose([Resize(512, 384, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal")]) - global BOUNDARY 
- BOUNDARY = args.boundary - - if args.img_resolution is not None: - IMG_RESOLUTION = (int(args.img_resolution.split('x')[0]), int(args.img_resolution.split('x')[1])) - global CROP_SIZE - CROP_SIZE = (int(IMG_RESOLUTION[0] // 4), int(IMG_RESOLUTION[1] // 4)) - if args.crop_size is not None: - CROP_SIZE = (int(args.crop_size.split('x')[0]), int(args.crop_size.split('x')[1])) - print("\nCurrent image resolution: {}\n Current crop size: {}".format(IMG_RESOLUTION, CROP_SIZE)) - - overwrite_kwargs = parse_unknown(unknown_args) - overwrite_kwargs['model_cfg_path'] = args.model_cfg_path - overwrite_kwargs["model"] = args.model - - # blur_mask_crop = generatemask(CROP_SIZE) - # plt.imshow(blur_mask_crop) - # plt.savefig('./nfs/results_show/crop_mask.png') - # blur_mask_crop = generatemask_coarse(IMG_RESOLUTION) - # plt.imshow(blur_mask_crop) - # plt.savefig('./nfs/results_show/whole_mask.png') - - config = get_config_user(args.model, **overwrite_kwargs) - config["pretrained_resource"] = '' - model = build_model(config) - model = load_ckpt(model, args.ckp_path) - model.eval() - model.cuda() - - # Create dataset from input images - dataset_custom = ImageDataset(args.rgb_dir, args.gt_dir, args.dataset_name) - - # start running - if args.show: - os.makedirs(args.show_path, exist_ok=True) - if args.save: - os.makedirs(args.save_path, exist_ok=True) - - run(model, dataset_custom, args.gt_dir, args.show_path, args.show, args.save, args.save_path, mode=args.mode, dataset_name=args.dataset_name, base_zoed=args.base_zoed, blr_mask=args.blur_mask) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 73ffa3d0ab4fdd95992da9008316bfe5a2ac3be3..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -gradio -trimesh==3.9.42 - -h5py -matplotlib -numpy==1.24.1 -torch==1.13.1 -scipy==1.10.0 -torchaudio==0.13.1 -torchvision==0.14.1 -json5==0.9.14 -einops==0.7.0 -huggingface-hub==0.19. -timm==0.6.12 -tqdm==4.64.1 -opencv-python==4.8.1.78 -imageio==2.31.6 -transformers==4.35.2 -open_clip_torch==2.0.2 -pytorch-lightning==1.5.0 -omegaconf==2.1.1 \ No newline at end of file diff --git a/tools/convert_huggingface.py b/tools/convert_huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..eb5969dfcb9f014b2083bce40a179b1eff71c4cd --- /dev/null +++ b/tools/convert_huggingface.py @@ -0,0 +1,86 @@ +import os +import os.path as osp +import argparse +import torch +import time +from torch.utils.data import DataLoader +from mmengine.utils import mkdir_or_exist +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger + +from estimator.utils import RunnerInfo, setup_env, log_env, fix_random_seed +from estimator.models.builder import build_model +from estimator.datasets.builder import build_dataset +from estimator.tester import Tester +from estimator.models.patchfusion import PatchFusion + +from mmengine import print_log + +from transformers import PretrainedConfig + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a segmentor') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--ckp-path', + type=str, + help='ckp_path') + parser.add_argument( + '--save-path', + type=str, + help='ckp_path') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + cfg.ckp_path = args.ckp_path + + # folder_name = os.path.dirname(args.save_path) + # print(folder_name) + # exit(100) + + # build model + + model = build_model(cfg.model) + print_log('Checkpoint Path: {}'.format(cfg.ckp_path), logger='current') + if hasattr(model, 'load_dict'): + print_log(model.load_dict(torch.load(cfg.ckp_path)['model_state_dict']), logger='current') + else: + print_log(model.load_state_dict(torch.load(cfg.ckp_path)['model_state_dict'], strict=True), logger='current') + model.eval() + + model.save_pretrained(args.save_path) + model.config.to_json_file(os.path.join(args.save_path, "config.json")) + + + # model = PatchFusion.from_pretrained('Zhyever/patchfusion_depth_anything_vits14') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/dist_test.sh b/tools/dist_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..72f060b66855a38fb1d7ab93d50e740dd08071b0 --- /dev/null +++ b/tools/dist_test.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-38423} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/test.py $CONFIG --launcher pytorch ${@:3} diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..69610ab13e00cdc3fac562d53b520653befadf8a --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-38423} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..995000a974b20f00012623dbcabc2ffc76442514 --- /dev/null +++ b/tools/test.py @@ -0,0 +1,231 @@ +import os +import os.path as osp +import argparse +import torch +import time +from torch.utils.data import DataLoader +from mmengine.utils import mkdir_or_exist +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger + +from estimator.utils import RunnerInfo, setup_env, log_env, fix_random_seed +from estimator.models.builder import build_model +from estimator.datasets.builder import build_dataset +from estimator.tester import Tester +from estimator.models.patchfusion import PatchFusion +from mmengine import print_log + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a segmentor') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--work-dir', + help='the dir to save logs and 
models', + default=None) + parser.add_argument( + '--test-type', + type=str, + default='normal', + help='evaluation type') + parser.add_argument( + '--ckp-path', + type=str, + help='ckp_path') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--save', + action='store_true', + default=False, + help='save colored prediction & depth predictions') + parser.add_argument( + '--cai-mode', + type=str, + default='m1', + help='m1, m2, or rx') + parser.add_argument( + '--process-num', + type=int, default=4, + help='batchsize number for inference') + parser.add_argument( + '--tag', + type=str, default='', + help='infer_infos') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use ckp path as default work_dir if cfg.work_dir is None + if '.pth' in args.ckp_path: + args.work_dir = osp.dirname(args.ckp_path) + else: + args.work_dir = osp.join('work_dir', args.ckp_path.split('/')[1]) + cfg.work_dir = args.work_dir + + mkdir_or_exist(cfg.work_dir) + cfg.ckp_path = args.ckp_path + + # fix seed + seed = cfg.get('seed', 5621) + fix_random_seed(seed) + + # start dist training + if cfg.launcher == 'none': + distributed = False + timestamp = torch.tensor(time.time(), dtype=torch.float64) + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime(timestamp.item())) + rank = 0 + world_size = 1 + env_cfg = cfg.get('env_cfg') + else: + distributed = True + env_cfg = cfg.get('env_cfg', dict(dist_cfg=dict(backend='nccl'))) + rank, world_size, timestamp = setup_env(env_cfg, distributed, cfg.launcher) + + # build dataloader + if args.test_type == 'consistency': + dataset = build_dataset(cfg.val_consistency_dataloader.dataset) + elif args.test_type == 'normal': + dataset = build_dataset(cfg.val_dataloader.dataset) + elif args.test_type == 'test_in': + dataset = build_dataset(cfg.test_in_dataloader.dataset) + elif args.test_type == 'test_out': + dataset = build_dataset(cfg.test_out_dataloader.dataset) + elif args.test_type == 'general': + dataset = build_dataset(cfg.general_dataloader.dataset) + else: + dataset = build_dataset(cfg.val_dataloader.dataset) + + # extract experiment name 
from cmd + config_path = args.config + exp_cfg_filename = config_path.split('/')[-1].split('.')[0] + ckp_name = args.ckp_path.replace('/', '_').replace('.pth', '') + dataset_name = dataset.dataset_name + # log_filename = 'eval_{}_{}_{}_{}.log'.format(timestamp, exp_cfg_filename, ckp_name, dataset_name) + log_filename = 'eval_{}_{}_{}_{}_{}.log'.format(exp_cfg_filename, args.tag, ckp_name, dataset_name, timestamp) + + # prepare basic text logger + log_file = osp.join(args.work_dir, log_filename) + log_cfg = dict(log_level='INFO', log_file=log_file) + log_cfg.setdefault('name', timestamp) + log_cfg.setdefault('logger_name', 'patchstitcher') + # `torch.compile` in PyTorch 2.0 could close all user defined handlers + # unexpectedly. Using file mode 'a' can help prevent abnormal + # termination of the FileHandler and ensure that the log file could + # be continuously updated during the lifespan of the runner. + log_cfg.setdefault('file_mode', 'a') + logger = MMLogger.get_instance(**log_cfg) + + # save some information useful during the training + runner_info = RunnerInfo() + runner_info.config = cfg # ideally, cfg should not be changed during process. information should be temp saved in runner_info + runner_info.logger = logger # easier way: use print_log("infos", logger='current') + runner_info.rank = rank + runner_info.distributed = distributed + runner_info.launcher = cfg.launcher + runner_info.seed = seed + runner_info.world_size = world_size + runner_info.work_dir = cfg.work_dir + runner_info.timestamp = timestamp + runner_info.save = args.save + runner_info.log_filename = log_filename + + if runner_info.save: + mkdir_or_exist(args.work_dir) + runner_info.work_dir = args.work_dir + # log_env(cfg, env_cfg, runner_info, logger) + + # build model + if '.pth' in cfg.ckp_path: + model = build_model(cfg.model) + print_log('Checkpoint Path: {}. Loading from a local file'.format(cfg.ckp_path), logger='current') + if hasattr(model, 'load_dict'): + print_log(model.load_dict(torch.load(cfg.ckp_path)['model_state_dict']), logger='current') + else: + print_log(model.load_state_dict(torch.load(cfg.ckp_path)['model_state_dict'], strict=True), logger='current') + else: + print_log('Checkpoint Path: {}. 
Loading from the huggingface repo'.format(cfg.ckp_path), logger='current') + assert cfg.ckp_path in \ + ['Zhyever/patchfusion_depth_anything_vits14', + 'Zhyever/patchfusion_depth_anything_vitb14', + 'Zhyever/patchfusion_depth_anything_vitl14', + 'Zhyever/patchfusion_zoedepth'], 'Invalid model name' + model = PatchFusion.from_pretrained(cfg.ckp_path) + model.eval() + + if runner_info.distributed: + torch.cuda.set_device(runner_info.rank) + model.cuda(runner_info.rank) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[runner_info.rank], output_device=runner_info.rank, + find_unused_parameters=cfg.get('find_unused_parameters', False)) + logger.info(model) + else: + model.cuda() + + if runner_info.distributed: + val_sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) + else: + val_sampler = None + + val_dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=cfg.val_dataloader.num_workers, + pin_memory=True, + persistent_workers=True, + sampler=val_sampler) + + # build tester + tester = Tester( + config=cfg, + runner_info=runner_info, + dataloader=val_dataloader, + model=model) + + if args.test_type == 'consistency': + tester.run_consistency() + else: + tester.run(args.cai_mode, process_num=args.process_num) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea5e2e8bb579b27f27ca0ada5c59153de95054f --- /dev/null +++ b/tools/train.py @@ -0,0 +1,219 @@ +import os +import os.path as osp +import argparse +import torch +import torch.nn as nn +import wandb +from torch.utils.data import DataLoader +from mmengine.utils import mkdir_or_exist +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger + +from estimator.utils import RunnerInfo, setup_env, log_env, fix_random_seed +from estimator.models.builder import build_model +from estimator.datasets.builder import build_dataset +from estimator.trainer import Trainer + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a segmentor') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume', + action='store_true', + default=False, + help='resume from the latest checkpoint in the work_dir automatically') + parser.add_argument( + '--debug', + action='store_true', + default=False, + help='debug mode') + parser.add_argument( + '--log-name', + type=str, default='', + help='log_name for wandb') + parser.add_argument( + '--tags', + type=str, default='', + help='tags for wandb') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--seed', + type=int, default=621, + help='for debug') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + +def main(): + + args = parse_args() + + # if args.debug: + # torch.autograd.set_detect_anomaly(True) # for debug + + # load config + cfg = Config.fromfile(args.config) + + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + cfg.work_dir = args.work_dir + cfg.work_dir = osp.join(cfg.work_dir, args.log_name) + + mkdir_or_exist(cfg.work_dir) + cfg.debug = args.debug + cfg.log_name = args.log_name + tags = args.tags + if ',' in tags: + tag_list = tags.split(',') + else: + tag_list = [tags] + cfg.tags = tag_list + + # fix seed + seed = args.seed + fix_random_seed(seed) + + # start dist training + if cfg.launcher == 'none': + distributed = False + else: + distributed = True + env_cfg = cfg.get('env_cfg', dict(dist_cfg=dict(backend='nccl'))) + rank, world_size, timestamp = setup_env(env_cfg, distributed, cfg.launcher) + + # prepare basic text logger + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + log_cfg = dict(log_level='INFO', log_file=log_file) + log_cfg.setdefault('name', timestamp) + log_cfg.setdefault('logger_name', 'patchstitcher') + # `torch.compile` in PyTorch 2.0 could close all user defined handlers + # unexpectedly. Using file mode 'a' can help prevent abnormal + # termination of the FileHandler and ensure that the log file could + # be continuously updated during the lifespan of the runner. + log_cfg.setdefault('file_mode', 'a') + logger = MMLogger.get_instance(**log_cfg) + + # save some information useful during the training + runner_info = RunnerInfo() + runner_info.config = cfg # ideally, cfg should not be changed during process. 
information should be temp saved in runner_info + runner_info.logger = logger # easier way: use print_log("infos", logger='current') + runner_info.rank = rank + runner_info.distributed = distributed + runner_info.launcher = cfg.launcher + runner_info.seed = seed + runner_info.world_size = world_size + runner_info.work_dir = cfg.work_dir + runner_info.timestamp = timestamp + + # start wandb + if runner_info.rank == 0 and cfg.debug == False: + wandb.init( + project=cfg.project, + name=cfg.log_name+"_"+runner_info.timestamp, + tags=cfg.tags, + dir=runner_info.work_dir, + config=cfg, # have a test + settings=wandb.Settings(start_method="fork")) + + wandb.define_metric("Val/step") + wandb.define_metric("Val/*", step_metric="Val/step") + wandb.define_metric("Train/step") + wandb.define_metric("Train/*", step_metric="Train/step") + + log_env(cfg, env_cfg, runner_info, logger) + + # resume training (future) + cfg.resume = args.resume + + # build model + model = build_model(cfg.model) + if runner_info.distributed: + torch.cuda.set_device(runner_info.rank) + if cfg.get('convert_syncbn', False): + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + model = model.cuda(runner_info.rank) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[runner_info.rank], output_device=runner_info.rank, + find_unused_parameters=cfg.get('find_unused_parameters', False)) + logger.info(model) + else: + model = model.cuda(runner_info.rank) + logger.info(model) + + # build dataloader + dataset = build_dataset(cfg.train_dataloader.dataset) + if runner_info.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + train_sampler = None + + train_dataloader = DataLoader( + dataset, + batch_size=cfg.train_dataloader.batch_size, + shuffle=(train_sampler is None), + num_workers=cfg.train_dataloader.num_workers, + pin_memory=True, + persistent_workers=True, + sampler=train_sampler) + + + dataset = build_dataset(cfg.val_dataloader.dataset) + if runner_info.distributed: + val_sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) + else: + val_sampler = None + + val_dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=cfg.val_dataloader.num_workers, + pin_memory=True, + persistent_workers=True, + sampler=val_sampler) + + # everything is ready, start training. But before that, save your config! + cfg.dump(osp.join(cfg.work_dir, 'config.py')) + + # build trainer + trainer = Trainer( + config=cfg, + runner_info=runner_info, + train_sampler=train_sampler, + train_dataloader=train_dataloader, + val_dataloader=val_dataloader, + model=model) + + trainer.run() + wandb.finish() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/torchhub/README.md b/torchhub/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eade757c3b0a25c350ba6bf3b5d2e6f048ad1de6 --- /dev/null +++ b/torchhub/README.md @@ -0,0 +1,3 @@ +# Local PyTorch Hub + +This directory is for loading the DINOv2 encoder locally in case of no Internet connection. 
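The torchhub/README.md note above is terse, so here is a minimal sketch of what loading the DINOv2 encoder from this local hub copy could look like. It assumes the layout added in this diff (torchhub/facebookresearch_dinov2_main containing the repo's hubconf.py); the checkpoint path in the commented lines is a hypothetical placeholder, and dinov2_vits14 is one of the standard DINOv2 entrypoint names referenced elsewhere in the diff.

import torch

# Local copy of the DINOv2 repo (must contain hubconf.py), vendored under torchhub/ in this diff.
local_repo = "torchhub/facebookresearch_dinov2_main"

# source='local' makes torch.hub read the directory on disk instead of fetching from GitHub;
# pretrained=False builds the architecture without any network access.
encoder = torch.hub.load(local_repo, "dinov2_vits14", source="local", pretrained=False)

# Hypothetical: weights could then be restored from a locally stored checkpoint.
# state_dict = torch.load("checkpoints/dinov2_vits14_pretrain.pth", map_location="cpu")
# encoder.load_state_dict(state_dict)
encoder.eval()

Vendoring the repo under torchhub/ keeps offline environments from ever hitting the torch.hub download step, which is exactly the scenario the README describes.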
diff --git a/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md b/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..3232ed665566ec047ce55a929db1581dbda266a1 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. 
+ +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md b/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..afc89823fc90b920f0758f50e4d808df6a884a34 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to DINOv2 +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to DINOv2, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/torchhub/facebookresearch_dinov2_main/LICENSE b/torchhub/facebookresearch_dinov2_main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a115f899f8d09ef3b1def4a16c7bae1a0bd50fbe --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/LICENSE @@ -0,0 +1,400 @@ + +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. 
The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. 
The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. 
automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
diff --git a/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md b/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md new file mode 100644 index 0000000000000000000000000000000000000000..5cd35748eb3c5d8f607f83ff068367a0102117c5 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md @@ -0,0 +1,201 @@ +# Model Card for DINOv2-S/B/L/g + +These are Vision Transformer models trained following the method described in the paper: +"DINOv2: Learning Robust Visual Features without Supervision" + +We provide 4 models: 1 ViT-g trained from scratch, and 3 ViT-S/B/L models distilled from the ViT-g. + +## Model Details +The model takes an image as input and returns a class token and patch tokens. + +The embedding dimension is: +- 384 for ViT-S. +- 768 for ViT-B. +- 1024 for ViT-L. +- 1536 for ViT-g. + +The models follow a Transformer architecture, with a patch size of 14. + +For a 224x224 image, this results in 1 class token + 256 patch tokens. + +The models can accept larger images provided the image shapes are multiples of the patch size (14). +If this condition is not verified, the model will crop to the closest smaller multiple of the patch size. + +### Model Description + +- **Developed by:** Meta AI +- **Model type:** Vision Transformer +- **License:** CC-BY-NC + +- **Repository:** https://github.com/facebookresearch/dinov2 +- **Paper:** https://arxiv.org/abs/2304.07193 +- **Demo:** https://dinov2.metademolab.com/ + +## Uses + +The models are vision backbones providing multi-purpose features for downstream tasks. + +### Direct Use + +The models can be used without fine-tuning, with downstream classifiers as simple as linear layers, to obtain competitive results: +- on depth estimation, semantic segmentation, using linear layers. +- on image classification, using k-NN classifiers on the class token. +- on image classification, with logistic regression classifiers applied on the class token. +- on image classification, with a linear layer applied on the class token and the average of the patch tokens. +- on image retrieval using nearest neighbors. + +### Downstream Use + +It is technically possible to perform fine-tuning on the models, for small gains (we measured +2% on ImageNet-1k classification). +We recommend keeping this as a very last step and only when necessary, as the features already provide good performance out-of-the-box. + +## Bias, Risks, and Limitations + +Despite improvements thanks to the training method not using annotations, we still observe significant biases in our models toward rich households from Western countries. + +### Recommendations + +We expect fine-tuning will increase the biases in the features produced by the model as they will be tuned to the fine-tuning labels. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +```python +import torch +dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') +dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14') +dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14') +dinov2_vitg14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14') +``` + +## Training Details + +### Training Data + +- **Training data:** LVD-142M (see paper) +- **Training regime:** fp16 using PyTorch-FSDP mixed-precision. 
+ +### Training Procedure + +- **Training objective:** + - DINO self-distillation loss with multi-crop + - iBOT masked-image modeling loss + - KoLeo regularization on [CLS] tokens +- **Architectures:** + - ViT-S (21M params): Patch size 14, embedding dimension 384, 6 heads, MLP FFN + - ViT-B (86M params): Patch size 14, embedding dimension 768, 12 heads, MLP FFN + - ViT-L (0.3B params): Patch size 14, embedding dimension 1024, 16 heads, MLP FFN + - ViT-g (1.1B params): Patch size 14, embedding dimension 1536, 24 heads, SwiGLU FFN +- **Distillation:** + - Distillation follows the standard DINOv2 pretraining procedure, except the teacher is a pretrained ViT-g, frozen. + +## Evaluation + +We refer users to the associated paper for the evaluation protocols. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| model | ImageNet-1k classif. (acc), k-NN | ImageNet-1k classif. (acc), linear | ImageNet-1k classif. V2 (acc), linear | NYU-Depth v2 depth (RMSE), linear 4 layers | SUN-RGBD depth (RMSE), NYU-D transfer | ADE20k segm. (mAP), multiscale | iNaturalist 2018 classif. (acc), linear | Oxford-H retrieval (mAP), nearest neighbor |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| ViT-S/14 | 79.0% | 81.1% | 70.8% | 0.417 | 0.431 | 47.2 | 69.5% | 43.2 |
+| ViT-B/14 | 82.1% | 84.5% | 74.9% | 0.362 | 0.400 | 51.3 | 76.3% | 49.5 |
+| ViT-L/14 | 83.5% | 86.3% | 77.6% | 0.333 | 0.396 | 53.1 | 79.8% | 54.0 |
+| ViT-g/14 | 83.5% | 86.5% | 78.4% | 0.298 | 0.362 | 53.0 | 81.6% | 52.3 |
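To make the "Direct Use" scenarios above concrete, the sketch below extracts the class token and patch tokens from one of the hub models and builds the input used for linear classification (class token concatenated with the averaged patch tokens). This is a minimal sketch: the `forward_features` output keys (`x_norm_clstoken`, `x_norm_patchtokens`), the example image path, and the preprocessing pipeline are assumptions to verify against the repository.

```python
# Minimal sketch: DINOv2 features for a downstream linear / k-NN classifier.
# Assumptions: the torch.hub entry point listed above, standard ImageNet normalization,
# and the forward_features output keys "x_norm_clstoken" / "x_norm_patchtokens".
import torch
from PIL import Image
from torchvision import transforms

model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14").eval()

preprocess = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),  # 224 is a multiple of the patch size (14)
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

image = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)  # hypothetical image

with torch.no_grad():
    features = model.forward_features(image)

cls_token = features["x_norm_clstoken"]        # shape (1, 384) for ViT-S/14
patch_tokens = features["x_norm_patchtokens"]  # shape (1, 256, 384) for a 224x224 input
# class token concatenated with the mean patch token, as described for linear classification
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=-1)
```

Calling `model(image)` directly should return only the class-token embedding, which is the representation used by the k-NN and retrieval evaluations.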
+ +## Environmental Impact + +- **Hardware Type:** Nvidia A100 +- **Hours used:** 22,000 for ViT-g, 4,500 for ViT-S distillation, 5,300 for ViT-B distillation, 8,000 for ViT-L distillation +- **Cloud Provider:** Private infra +- **Compute Region:** USA +- **Carbon Emitted:** 7t CO2eq + +#### Hardware + +Nvidia A100 GPUs + +#### Software + +PyTorch 2.0, +xFormers 0.0.18 + +**BibTeX** + +``` +@misc{oquab2023dinov2, + title={DINOv2: Learning Robust Visual Features without Supervision}, + author={Oquab, Maxime and Darcet, Timothée and Moutakanni, Theo and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Howes, Russell and Huang, Po-Yao and Xu, Hu and Sharma, Vasu and Li, Shang-Wen and Galuba, Wojciech and Rabbat, Mike and Assran, Mido and Ballas, Nicolas and Synnaeve, Gabriel and Misra, Ishan and Jegou, Herve and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr}, + journal={arXiv:2304.07193}, + year={2023} +} +``` diff --git a/torchhub/facebookresearch_dinov2_main/README.md b/torchhub/facebookresearch_dinov2_main/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96453e567dee10148be83b5e92d91f347f8521d5 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/README.md @@ -0,0 +1,277 @@ +# DINOv2: Learning Robust Visual Features without Supervision + +**[Meta AI Research, FAIR](https://ai.facebook.com/research/)** + +Maxime Oquab, +Timothée Darcet, +Théo Moutakanni, +Huy V. Vo, +Marc Szafraniec, +Vasil Khalidov, +Patrick Labatut, +Armand Joulin, +Piotr Bojanowski + +[[`Paper`](https://arxiv.org/abs/2304.07193)] [[`Blog`](https://ai.facebook.com/blog/dino-v2-computer-vision-self-supervised-learning/)] [[`Demo`](https://dinov2.metademolab.com)] [[`BibTeX`](#citing-dinov2)] + +PyTorch implementation and pretrained models for DINOv2. For details, see the paper: **[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)**. + +DINOv2 models produce high-performance visual features that can be directly employed with classifiers as simple as linear layers on a variety of computer vision tasks; these visual features are robust and perform well across domains without any requirement for fine-tuning. The models were pretrained on a dataset of 142 M images without using any labels or annotations. + +https://github.com/facebookresearch/dinov2/assets/60359573/f168823e-7922-415a-b429-578badf5c356 + +
+ Visualization of the first three principal components of the patch features of all frames, mapped to RGB values.
+ +## Pretrained models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| model | # of params | ImageNet k-NN | ImageNet linear | download |
+| --- | --- | --- | --- | --- |
+| ViT-S/14 distilled | 21 M | 79.0% | 81.1% | backbone only |
+| ViT-B/14 distilled | 86 M | 82.1% | 84.5% | backbone only |
+| ViT-L/14 distilled | 300 M | 83.5% | 86.3% | backbone only |
+| ViT-g/14 | 1,100 M | 83.5% | 86.5% | backbone only |
+ +### Pretrained models via PyTorch Hub + +Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install PyTorch (the only required dependency for loading the model). Installing PyTorch with CUDA support is strongly recommended. + +A corresponding [model card](MODEL_CARD.md) is included in the repository. + +```python +import torch + +dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') +dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14') +dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14') +dinov2_vitg14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14') +``` + +## Installation + +The training and evaluation code requires PyTorch 2.0 and [xFormers](https://github.com/facebookresearch/xformers) 0.0.18 as well as a number of other 3rd party packages. Note that the code has only been tested with the specified versions and also expects a Linux environment. To setup all the required dependencies for training and evaluation, please follow the instructions below: + +*[conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html)* **(Recommended)** - Clone the repository and then create and activate a `dinov2` conda environment using the provided environment definition: + +```shell +conda env create -f conda.yaml +conda activate dinov2 +``` + +*[pip](https://pip.pypa.io/en/stable/getting-started/)* - Clone the repository and then use the provided `requirements.txt` to install the dependencies: + +```shell +pip install -r requirements.txt +``` + +## Data preparation + +### ImageNet-1k + +The root directory of the dataset should hold the following contents: + +- `/test/ILSVRC2012_test_00000001.JPEG` +- `/test/[..]` +- `/test/ILSVRC2012_test_00100000.JPEG` +- `/train/n01440764/n01440764_10026.JPEG` +- `/train/[...]` +- `/train/n15075141/n15075141_9993.JPEG` +- `/val/n01440764/ILSVRC2012_val_00000293.JPEG` +- `/val/[...]` +- `/val/n15075141/ILSVRC2012_val_00049174.JPEG` +- `/labels.txt` + +The provided dataset implementation expects a few additional metadata files to be present under the extra directory: + +- `/class-ids-TRAIN.npy` +- `/class-ids-VAL.npy` +- `/class-names-TRAIN.npy` +- `/class-names-VAL.npy` +- `/entries-TEST.npy` +- `/entries-TRAIN.npy` +- `/entries-VAL.npy` + +These metadata files can be generated (once) with the following lines of Python code: + +```python +from dinov2.data.datasets import ImageNet + +for split in ImageNet.Split: + dataset = ImageNet(split=split, root="", extra="") + dataset.dump_extra() +``` + +Note that the root and extra directories do not have to be distinct directories. + +### ImageNet-22k + +Please adapt the [dataset class](dinov2/data/datasets/image_net_22k.py) to match your local setup. + +
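The dataset strings used by the training and evaluation commands below (e.g. `ImageNet:split=TRAIN:root=...:extra=...`) are parsed by `dinov2.data.make_dataset`. As a minimal sketch (the paths are placeholders, and the `(image, target)` sample format is an assumption), a dataset can be built the same way outside of the provided entry points, with the `dinov2` package on `PYTHONPATH`:

```python
# Minimal sketch: build an ImageNet dataset from a dataset string, as the
# training / evaluation entry points do. The paths below are placeholders.
from dinov2.data import make_dataset
from dinov2.data.transforms import make_classification_eval_transform

dataset = make_dataset(
    dataset_str="ImageNet:split=VAL:root=/path/to/imagenet:extra=/path/to/imagenet-extra",
    transform=make_classification_eval_transform(),  # resize, center-crop, ImageNet normalization
)

image, target = dataset[0]  # assumed to be a (3, 224, 224) tensor and an integer class index
print(image.shape, target)
```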
+ +:warning: To execute the commands provided in the next sections for training and evaluation, the `dinov2` package should be included in the Python module search path, i.e. simply prefix the command to run with `PYTHONPATH=.`. + +## Training + +### Fast setup: training DINOv2 ViT-L/16 on ImageNet-1k + +Run DINOv2 training on 4 A100-80GB nodes (32 GPUs) in a SLURM cluster environment with submitit: + +```shell +python dinov2/run/train/train.py \ + --nodes 4 \ + --config-file dinov2/configs/train/vitl16_short.yaml \ + --output-dir \ + train.dataset_path=ImageNet:split=TRAIN:root=:extra= +``` + +Training time is approximately 1 day and the resulting checkpoint should reach 81.6% on k-NN eval and 82.9% on linear eval. + +The training code saves the weights of the teacher in the `eval` folder every 12500 iterations for evaluation. + +### Long setup: training DINOv2 ViT-L/14 on ImageNet-22k + +Run DINOv2 training on 12 A100-80GB nodes (96 GPUs) in a SLURM cluster environment with submitit: + +```shell +python dinov2/run/train/train.py \ + --nodes 12 \ + --config-file dinov2/configs/train/vitl14.yaml \ + --output-dir \ + train.dataset_path=ImageNet22k:root=:extra= +``` + +Training time is approximately 3.3 days and the resulting checkpoint should reach 82.0% on k-NN eval and 84.5% on linear eval. + +The training code saves the weights of the teacher in the `eval` folder every 12500 iterations for evaluation. + + +## Evaluation + +The training code regularly saves the teacher weights. In order to evaluate the model, run the following evaluation on a single node: + +### k-NN classification on ImageNet-1k + +```shell +python dinov2/run/eval/knn.py \ + --config-file /config.yaml \ + --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \ + --output-dir /eval/training_24999/knn \ + --train-dataset ImageNet:split=TRAIN:root=:extra= \ + --val-dataset ImageNet:split=VAL:root=:extra= +``` + +### Logistic regression classification on ImageNet-1k + +```shell +python dinov2/run/eval/log_regression.py \ + --config-file /config.yaml \ + --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \ + --output-dir /eval/training_24999/logreg \ + --train-dataset ImageNet:split=TRAIN:root=:extra= \ + --val-dataset ImageNet:split=VAL:root=:extra= +``` + +### Linear classification with data augmentation on ImageNet-1k + +```shell +python dinov2/run/eval/linear.py \ + --config-file /config.yaml \ + --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \ + --output-dir /eval/training_24999/linear \ + --train-dataset ImageNet:split=TRAIN:root=:extra= \ + --val-dataset ImageNet:split=VAL:root=:extra= +``` + +We release the weights from evaluating the different models: + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| model | ImageNet top-1 | linear evaluation |
+| --- | --- | --- |
+| ViT-S/14 distilled | 81.1% | linear head weights |
+| ViT-B/14 distilled | 84.5% | linear head weights |
+| ViT-L/14 distilled | 86.3% | linear head weights |
+| ViT-g/14 | 86.5% | linear head weights |
+ +The performance of the provided pretrained model weights can be evaluated as follows on ImageNet-1k: + +```shell +python dinov2/run/eval/linear.py \ + --config-file dinov2/configs/eval/vitg14_pretrain.yaml \ + --pretrained-weights https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_pretrain.pth \ + --train-dataset ImageNet:split=TRAIN:root=:extra= \ + --val-dataset ImageNet:split=VAL:root=:extra= +``` + +## License + +DINOv2 code and model weights are released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. + +## Contributing + +See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md). + +## Citing DINOv2 + +If you find this repository useful, please consider giving a star :star: and citation :t-rex:: + +``` +@misc{oquab2023dinov2, + title={DINOv2: Learning Robust Visual Features without Supervision}, + author={Oquab, Maxime and Darcet, Timothée and Moutakanni, Theo and Vo, Huy V. and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Howes, Russell and Huang, Po-Yao and Xu, Hu and Sharma, Vasu and Li, Shang-Wen and Galuba, Wojciech and Rabbat, Mike and Assran, Mido and Ballas, Nicolas and Synnaeve, Gabriel and Misra, Ishan and Jegou, Herve and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr}, + journal={arXiv:2304.07193}, + year={2023} +} +``` diff --git a/torchhub/facebookresearch_dinov2_main/conda.yaml b/torchhub/facebookresearch_dinov2_main/conda.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35dfc30adc275da51b58ff2340dd1d53d2cb9250 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/conda.yaml @@ -0,0 +1,22 @@ +name: dinov2 +channels: + - defaults + - pytorch + - nvidia + - xformers + - conda-forge +dependencies: + - python=3.9 + - pytorch::pytorch=2.0.0 + - pytorch::pytorch-cuda=11.7.0 + - pytorch::torchvision=0.15.0 + - omegaconf + - torchmetrics=0.10.3 + - fvcore + - iopath + - xformers::xformers=0.0.18 + - pip + - pip: + - git+https://github.com/facebookincubator/submitit + - --extra-index-url https://pypi.nvidia.com + - cuml-cu11 diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b4afb514783786adf76744f9b97f3e1db1d6081 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +__version__ = "0.0.1" diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..033c35660450afec6612adb342c7c30e1ccd15ee --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pathlib + +from omegaconf import OmegaConf + + +def load_config(config_name: str): + config_filename = config_name + ".yaml" + return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) + + +dinov2_default_config = load_config("ssl_default_config") + + +def load_and_merge_config(config_name: str): + default_config = OmegaConf.create(dinov2_default_config) + loaded_config = load_config(config_name) + return OmegaConf.merge(default_config, loaded_config) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..117d0f027ca26cd8ce6c010bb78d5a8fac42c70e --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_base + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a96dd5b117b4d59ee210b65037821f1b3e3f16e3 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml @@ -0,0 +1,7 @@ +student: + arch: vit_giant2 + patch_size: 14 + ffn_layer: swiglufused +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a984548bd034f762d455419d7193917fa462dd8 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_large + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afbdb4ba14f1c97130a25b579360f4d817cda495 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml @@ -0,0 +1,6 @@ +student: + arch: vit_small + patch_size: 14 +crops: + global_crops_size: 518 # this is to set up the position embeddings properly + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4ef04545ce9d6cc52b5179236008adc8a9bbda2 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml @@ -0,0 +1,115 @@ +MODEL: + WEIGHTS: '' +compute_precision: + grad_scaler: true + teacher: + backbone: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + dino_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: 
fp32 + ibot_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + student: + backbone: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp16 + buffer_dtype: fp32 + dino_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp32 + buffer_dtype: fp32 + ibot_head: + sharding_strategy: SHARD_GRAD_OP + mixed_precision: + param_dtype: fp16 + reduce_dtype: fp32 + buffer_dtype: fp32 +dino: + loss_weight: 1.0 + head_n_prototypes: 65536 + head_bottleneck_dim: 256 + head_nlayers: 3 + head_hidden_dim: 2048 + koleo_loss_weight: 0.1 +ibot: + loss_weight: 1.0 + mask_sample_probability: 0.5 + mask_ratio_min_max: + - 0.1 + - 0.5 + separate_head: false + head_n_prototypes: 65536 + head_bottleneck_dim: 256 + head_nlayers: 3 + head_hidden_dim: 2048 +train: + batch_size_per_gpu: 64 + dataset_path: ImageNet:split=TRAIN + output_dir: . + saveckp_freq: 20 + seed: 0 + num_workers: 10 + OFFICIAL_EPOCH_LENGTH: 1250 + cache_dataset: true + centering: "centering" # or "sinkhorn_knopp" +student: + arch: vit_large + patch_size: 16 + drop_path_rate: 0.3 + layerscale: 1.0e-05 + drop_path_uniform: true + pretrained_weights: '' + ffn_layer: "mlp" + block_chunks: 0 + qkv_bias: true + proj_bias: true + ffn_bias: true +teacher: + momentum_teacher: 0.992 + final_momentum_teacher: 1 + warmup_teacher_temp: 0.04 + teacher_temp: 0.07 + warmup_teacher_temp_epochs: 30 +optim: + epochs: 100 + weight_decay: 0.04 + weight_decay_end: 0.4 + base_lr: 0.004 # learning rate for a batch size of 1024 + lr: 0. # will be set after applying scaling rule + warmup_epochs: 10 + min_lr: 1.0e-06 + clip_grad: 3.0 + freeze_last_layer_epochs: 1 + scaling_rule: sqrt_wrt_1024 + patch_embed_lr_mult: 0.2 + layerwise_decay: 0.9 + adamw_beta1: 0.9 + adamw_beta2: 0.999 +crops: + global_crops_scale: + - 0.32 + - 1.0 + local_crops_number: 8 + local_crops_scale: + - 0.05 + - 0.32 + global_crops_size: 224 + local_crops_size: 96 +evaluation: + eval_period_iterations: 12500 diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d05cf0d59e07ac6e4a2b0f9bdcb6131d7c508962 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml @@ -0,0 +1,26 @@ +dino: + head_n_prototypes: 131072 + head_bottleneck_dim: 384 +ibot: + separate_head: true + head_n_prototypes: 131072 +train: + batch_size_per_gpu: 12 + dataset_path: ImageNet22k + centering: sinkhorn_knopp +student: + arch: vit_giant2 + patch_size: 14 + drop_path_rate: 0.4 + ffn_layer: swiglufused + block_chunks: 4 +teacher: + momentum_teacher: 0.994 +optim: + epochs: 500 + weight_decay_end: 0.2 + base_lr: 2.0e-04 # learning rate for a batch size of 1024 + warmup_epochs: 80 + layerwise_decay: 1.0 +crops: + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d9b491dcc6a522c71328fc2933dd0501123c8f6b --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml @@ -0,0 +1,26 @@ +dino: + head_n_prototypes: 131072 + head_bottleneck_dim: 384 +ibot: + separate_head: true + head_n_prototypes: 131072 +train: + batch_size_per_gpu: 32 + 
dataset_path: ImageNet22k + centering: sinkhorn_knopp +student: + arch: vit_large + patch_size: 14 + drop_path_rate: 0.4 + ffn_layer: swiglufused + block_chunks: 4 +teacher: + momentum_teacher: 0.994 +optim: + epochs: 500 + weight_decay_end: 0.2 + base_lr: 2.0e-04 # learning rate for a batch size of 1024 + warmup_epochs: 80 + layerwise_decay: 1.0 +crops: + local_crops_size: 98 \ No newline at end of file diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e7e72864c92175a1354142ac1d64da8070d1e5e --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml @@ -0,0 +1,6 @@ +# this corresponds to the default config +train: + dataset_path: ImageNet:split=TRAIN + batch_size_per_gpu: 64 +student: + block_chunks: 4 diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..357db5c542c5810391ba2bd45a60c13c01c3737a --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .adapters import DatasetWithEnumeratedTargets +from .loaders import make_data_loader, make_dataset, SamplerType +from .collate import collate_data_and_cast +from .masking import MaskingGenerator +from .augmentations import DataAugmentationDINO diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..7dcbc68e046f03460d5867f1388d5380d9c6f603 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Tuple + +from torch.utils.data import Dataset + + +class DatasetWithEnumeratedTargets(Dataset): + def __init__(self, dataset): + self._dataset = dataset + + def get_image_data(self, index: int) -> bytes: + return self._dataset.get_image_data(index) + + def get_target(self, index: int) -> Tuple[Any, int]: + target = self._dataset.get_target(index) + return (index, target) + + def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: + image, target = self._dataset[index] + target = index if target is None else target + return image, (index, target) + + def __len__(self) -> int: + return len(self._dataset) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca28cb59a4de2566a6c9ef9c301cbbb4e54b5ee --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging + +from torchvision import transforms + +from .transforms import ( + GaussianBlur, + make_normalize_transform, +) + + +logger = logging.getLogger("dinov2") + + +class DataAugmentationDINO(object): + def __init__( + self, + global_crops_scale, + local_crops_scale, + local_crops_number, + global_crops_size=224, + local_crops_size=96, + ): + self.global_crops_scale = global_crops_scale + self.local_crops_scale = local_crops_scale + self.local_crops_number = local_crops_number + self.global_crops_size = global_crops_size + self.local_crops_size = local_crops_size + + logger.info("###################################") + logger.info("Using data augmentation parameters:") + logger.info(f"global_crops_scale: {global_crops_scale}") + logger.info(f"local_crops_scale: {local_crops_scale}") + logger.info(f"local_crops_number: {local_crops_number}") + logger.info(f"global_crops_size: {global_crops_size}") + logger.info(f"local_crops_size: {local_crops_size}") + logger.info("###################################") + + # random resized crop and flip + self.geometric_augmentation_global = transforms.Compose( + [ + transforms.RandomResizedCrop( + global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + self.geometric_augmentation_local = transforms.Compose( + [ + transforms.RandomResizedCrop( + local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + # color distorsions / blurring + color_jittering = transforms.Compose( + [ + transforms.RandomApply( + [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], + p=0.8, + ), + transforms.RandomGrayscale(p=0.2), + ] + ) + + global_transfo1_extra = GaussianBlur(p=1.0) + + global_transfo2_extra = transforms.Compose( + [ + GaussianBlur(p=0.1), + transforms.RandomSolarize(threshold=128, p=0.2), + ] + ) + + local_transfo_extra = GaussianBlur(p=0.5) + + # normalization + self.normalize = transforms.Compose( + [ + transforms.ToTensor(), + make_normalize_transform(), + ] + ) + + self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize]) + self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize]) + self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize]) + + def __call__(self, image): + output = {} + + # global crops: + im1_base = self.geometric_augmentation_global(image) + global_crop_1 = self.global_transfo1(im1_base) + + im2_base = self.geometric_augmentation_global(image) + global_crop_2 = self.global_transfo2(im2_base) + + output["global_crops"] = [global_crop_1, global_crop_2] + + # global crops for teacher: + output["global_crops_teacher"] = [global_crop_1, global_crop_2] + + # local crops: + local_crops = [ + self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number) + ] + output["local_crops"] = local_crops + output["offsets"] = () + + return output diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..9f0d98906808ed326dff4486d95b3ec04f8a5e75 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import random + + +def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): + # dtype = torch.half # TODO: Remove + + n_global_crops = len(samples_list[0][0]["global_crops"]) + n_local_crops = len(samples_list[0][0]["local_crops"]) + + collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) + + collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) + + B = len(collated_global_crops) + N = n_tokens + n_samples_masked = int(B * mask_probability) + probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) + upperbound = 0 + masks_list = [] + for i in range(0, n_samples_masked): + prob_min = probs[i] + prob_max = probs[i + 1] + masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) + upperbound += int(N * prob_max) + for i in range(n_samples_masked, B): + masks_list.append(torch.BoolTensor(mask_generator(0))) + + random.shuffle(masks_list) + + collated_masks = torch.stack(masks_list).flatten(1) + mask_indices_list = collated_masks.flatten().nonzero().flatten() + + masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] + + return { + "collated_global_crops": collated_global_crops.to(dtype), + "collated_local_crops": collated_local_crops.to(dtype), + "collated_masks": collated_masks, + "mask_indices_list": mask_indices_list, + "masks_weight": masks_weight, + "upperbound": upperbound, + "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), + } diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py new file mode 100644 index 0000000000000000000000000000000000000000..9fb6f25a0a3c3251b803f48d0a515aa0b9591226 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py @@ -0,0 +1,223 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from enum import Enum +from typing import Any, Callable, List, Optional, TypeVar + +import torch +from torch.utils.data import Sampler + +from .datasets import ImageNet, ImageNet22k +from .samplers import EpochSampler, InfiniteSampler, ShardedInfiniteSampler + + +logger = logging.getLogger("dinov2") + + +class SamplerType(Enum): + DISTRIBUTED = 0 + EPOCH = 1 + INFINITE = 2 + SHARDED_INFINITE = 3 + SHARDED_INFINITE_NEW = 4 + + +def _make_bool_str(b: bool) -> str: + return "yes" if b else "no" + + +def _make_sample_transform(image_transform: Optional[Callable] = None, target_transform: Optional[Callable] = None): + def transform(sample): + image, target = sample + if image_transform is not None: + image = image_transform(image) + if target_transform is not None: + target = target_transform(target) + return image, target + + return transform + + +def _parse_dataset_str(dataset_str: str): + tokens = dataset_str.split(":") + + name = tokens[0] + kwargs = {} + + for token in tokens[1:]: + key, value = token.split("=") + assert key in ("root", "extra", "split") + kwargs[key] = value + + if name == "ImageNet": + class_ = ImageNet + if "split" in kwargs: + kwargs["split"] = ImageNet.Split[kwargs["split"]] + elif name == "ImageNet22k": + class_ = ImageNet22k + else: + raise ValueError(f'Unsupported dataset "{name}"') + + return class_, kwargs + + +def make_dataset( + *, + dataset_str: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, +): + """ + Creates a dataset with the specified parameters. + + Args: + dataset_str: A dataset string description (e.g. ImageNet:split=TRAIN). + transform: A transform to apply to images. + target_transform: A transform to apply to targets. + + Returns: + The created dataset. + """ + logger.info(f'using dataset: "{dataset_str}"') + + class_, kwargs = _parse_dataset_str(dataset_str) + dataset = class_(transform=transform, target_transform=target_transform, **kwargs) + + logger.info(f"# of dataset samples: {len(dataset):,d}") + + # Aggregated datasets do not expose (yet) these attributes, so add them. 
+ if not hasattr(dataset, "transform"): + setattr(dataset, "transform", transform) + if not hasattr(dataset, "target_transform"): + setattr(dataset, "target_transform", target_transform) + + return dataset + + +def _make_sampler( + *, + dataset, + type: Optional[SamplerType] = None, + shuffle: bool = False, + seed: int = 0, + size: int = -1, + advance: int = 0, +) -> Optional[Sampler]: + sample_count = len(dataset) + + if type == SamplerType.INFINITE: + logger.info("sampler: infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + return InfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + ) + elif type in (SamplerType.SHARDED_INFINITE, SamplerType.SHARDED_INFINITE_NEW): + logger.info("sampler: sharded infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + # TODO: Remove support for old shuffling + use_new_shuffle_tensor_slice = type == SamplerType.SHARDED_INFINITE_NEW + return ShardedInfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + use_new_shuffle_tensor_slice=use_new_shuffle_tensor_slice, + ) + elif type == SamplerType.EPOCH: + logger.info("sampler: epoch") + if advance > 0: + raise NotImplementedError("sampler advance > 0 is not supported") + size = size if size > 0 else sample_count + logger.info(f"# of samples / epoch: {size:,d}") + return EpochSampler( + size=size, + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + ) + elif type == SamplerType.DISTRIBUTED: + logger.info("sampler: distributed") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + if advance > 0: + raise ValueError("sampler advance > 0 is invalid") + return torch.utils.data.DistributedSampler( + dataset=dataset, + shuffle=shuffle, + seed=seed, + drop_last=False, + ) + + logger.info("sampler: none") + return None + + +T = TypeVar("T") + + +def make_data_loader( + *, + dataset, + batch_size: int, + num_workers: int, + shuffle: bool = True, + seed: int = 0, + sampler_type: Optional[SamplerType] = SamplerType.INFINITE, + sampler_size: int = -1, + sampler_advance: int = 0, + drop_last: bool = True, + persistent_workers: bool = False, + collate_fn: Optional[Callable[[List[T]], Any]] = None, +): + """ + Creates a data loader with the specified parameters. + + Args: + dataset: A dataset (third party, LaViDa or WebDataset). + batch_size: The size of batches to generate. + num_workers: The number of workers to use. + shuffle: Whether to shuffle samples. + seed: The random seed to use. + sampler_type: Which sampler to use: EPOCH, INFINITE, SHARDED_INFINITE, SHARDED_INFINITE_NEW, DISTRIBUTED or None. + sampler_size: The number of images per epoch (when applicable) or -1 for the entire dataset. + sampler_advance: How many samples to skip (when applicable). + drop_last: Whether the last non-full batch of data should be dropped. + persistent_workers: maintain the workers Dataset instances alive after a dataset has been consumed once. 
+ collate_fn: Function that performs batch collation + """ + + sampler = _make_sampler( + dataset=dataset, + type=sampler_type, + shuffle=shuffle, + seed=seed, + size=sampler_size, + advance=sampler_advance, + ) + + logger.info("using PyTorch data loader") + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + drop_last=drop_last, + persistent_workers=persistent_workers, + collate_fn=collate_fn, + ) + + try: + logger.info(f"# of batches: {len(data_loader):,d}") + except TypeError: # data loader has no length + logger.info("infinite data loader") + return data_loader diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py new file mode 100644 index 0000000000000000000000000000000000000000..dc3c72648c3e440dcdb284366b98d2df12ad1272 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import random +import math +import numpy as np + + +class MaskingGenerator: + def __init__( + self, + input_size, + num_masking_patches=None, + min_num_patches=4, + max_num_patches=None, + min_aspect=0.3, + max_aspect=None, + ): + if not isinstance(input_size, tuple): + input_size = (input_size,) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def __repr__(self): + repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, + self.width, + self.min_num_patches, + self.max_num_patches, + self.num_masking_patches, + self.log_aspect_ratio[0], + self.log_aspect_ratio[1], + ) + return repr_str + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for _ in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top : top + h, left : left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self, num_masking_patches=0): + mask = np.zeros(shape=self.get_shape(), dtype=bool) + mask_count = 0 + while mask_count < num_masking_patches: + max_mask_patches = num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py new file mode 100644 index 
0000000000000000000000000000000000000000..e356edf603a33ce2d18a388fd799694e22d1980f --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import itertools +from typing import Any, Optional +import warnings + +import numpy as np +import torch +from torch.utils.data.sampler import Sampler + +import dinov2.distributed as distributed + + +class EpochSampler(Sampler): + def __init__( + self, + *, + size: int, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + ): + self._size = size + self._sample_count = sample_count + self._shuffle = shuffle + self._seed = seed + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._epoch = 0 + + def __iter__(self): + count = (self._size + self._sample_count - 1) // self._sample_count + tiled_indices = np.tile(np.arange(self._sample_count), count) + if self._shuffle: + seed = self._seed * self._epoch if self._seed != 0 else self._epoch + rng = np.random.default_rng(seed) + iterable = rng.choice(tiled_indices, self._size, replace=False) + else: + iterable = tiled_indices[: self._size] + + yield from itertools.islice(iterable, self._start, None, self._step) + + def __len__(self): + return (self._size - self._start + self._step - 1) // self._step + + def set_epoch(self, epoch): + self._epoch = epoch + + +def _get_numpy_dtype(size: int) -> Any: + return np.int32 if size <= 2**31 else np.int64 + + +def _get_torch_dtype(size: int) -> Any: + return torch.int32 if size <= 2**31 else torch.int64 + + +def _generate_randperm_indices(*, size: int, generator: torch.Generator): + """Generate the indices of a random permutation.""" + dtype = _get_torch_dtype(size) + # This is actually matching PyTorch's CPU implementation, see: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921 + perm = torch.arange(size, dtype=dtype) + for i in range(size): + j = torch.randint(i, size, size=(1,), generator=generator).item() + + # Always swap even if no-op + value = perm[j].item() + perm[j] = perm[i].item() + perm[i] = value + yield value + + +class InfiniteSampler(Sampler): + def __init__( + self, + *, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + advance: int = 0, + ): + self._sample_count = sample_count + self._seed = seed + self._shuffle = shuffle + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._advance = advance + + def __iter__(self): + if self._shuffle: + iterator = self._shuffled_iterator() + else: + iterator = self._iterator() + + yield from itertools.islice(iterator, self._advance, None) + + def _iterator(self): + assert not self._shuffle + + while True: + iterable = range(self._sample_count) + yield from itertools.islice(iterable, self._start, None, self._step) + + def _shuffled_iterator(self): + assert self._shuffle + + # Instantiate a generator here (rather than in the ctor) to keep the class + # picklable (requirement of mp.spawn) + generator = torch.Generator().manual_seed(self._seed) + + while True: + iterable = 
_generate_randperm_indices(size=self._sample_count, generator=generator) + yield from itertools.islice(iterable, self._start, None, self._step) + + +# The following function is somewhat equivalent to _new_shuffle_tensor_slice below, +# but avoids a full in-place random permutation generation. +def _shuffle_tensor_slice( + *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator +) -> np.ndarray: + stop = len(tensor) + count = stop // step + drop_count = stop - step * count + if drop_count: + warnings.warn(f"# of dropped samples: {drop_count}") + + dtype = _get_numpy_dtype(stop) + result = np.empty(count, dtype=dtype) + + for i in range(count): + j = torch.randint(0, i + 1, size=(1,), generator=generator).item() if i > 0 else 0 + + result[i] = result[j] + result[j] = tensor[start + i * step].item() + + return result + + +def _new_shuffle_tensor_slice( + *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator +) -> np.ndarray: + stop = len(tensor) + count = stop // step + dtype = torch.int64 # Needed for using randperm result as indices + count = stop // step + drop_count = stop - step * count + if drop_count: + warnings.warn(f"# of dropped samples: {drop_count}") + indices = torch.randperm(count, dtype=dtype, generator=generator) + return tensor[start::step][indices].numpy() + + +def _make_seed(seed: int, start: int, iter_count: int) -> int: + # NOTE: Tried a few variants (including iter_count << 32), this one worked best. + return seed + start + (iter_count << 24) + + +class ShardedInfiniteSampler(Sampler): + def __init__( + self, + *, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + advance: int = 0, + use_new_shuffle_tensor_slice: bool = False, + ): + self._sample_count = sample_count + self._seed = seed + self._shuffle = shuffle + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._advance = advance + self._iter_count = 0 + self._shuffle_tensor_slice_fn = ( + _new_shuffle_tensor_slice if use_new_shuffle_tensor_slice else _shuffle_tensor_slice + ) + + def __iter__(self): + iter_count = self._advance // self._sample_count + if iter_count > 0: + self._advance -= iter_count * self._sample_count + self._iter_count += iter_count + + if self._shuffle: + iterator = self._shuffled_iterator() + else: + iterator = self._iterator() + + yield from itertools.islice(iterator, self._advance, None) + + def _iterator(self): + assert not self._shuffle + + while True: + iterable = range(self._sample_count) + yield from itertools.islice(iterable, self._start, None, self._step) + + def _shuffled_iterator(self): + assert self._shuffle + + # Instantiate a generator here (rather than in the ctor) to be keep the class + # picklable (requirement of mp.spawn) + generator = torch.Generator() + + # Always shuffle everything first + generator.manual_seed(self._seed) + dtype = _get_torch_dtype(self._sample_count) + perm = torch.randperm(self._sample_count, dtype=dtype, generator=generator) + + while True: + # Re-seed on each iteration to allow skipping whole permutations + seed = _make_seed(self._seed, self._start, self._iter_count) + generator.manual_seed(seed) + + iterable = self._shuffle_tensor_slice_fn( + tensor=perm, start=self._start, step=self._step, generator=generator + ) + yield from iterable + self._iter_count += 1 diff --git 
a/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py b/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..f1bc4cbd1a459a9f44314806cf9ccedea112ab14 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Sequence + +import torch +from torchvision import transforms + + +class GaussianBlur(transforms.RandomApply): + """ + Apply Gaussian Blur to the PIL image. + """ + + def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0): + # NOTE: torchvision is applying 1 - probability to return the original image + keep_p = 1 - p + transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max)) + super().__init__(transforms=[transform], p=keep_p) + + +class MaybeToTensor(transforms.ToTensor): + """ + Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor. + """ + + def __call__(self, pic): + """ + Args: + pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + + +# Use timm's names +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def make_normalize_transform( + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Normalize: + return transforms.Normalize(mean=mean, std=std) + + +# This roughly matches torchvision's preset for classification training: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44 +def make_classification_train_transform( + *, + crop_size: int = 224, + interpolation=transforms.InterpolationMode.BICUBIC, + hflip_prob: float = 0.5, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +): + transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + if hflip_prob > 0.0: + transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms_list.extend( + [ + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + ) + return transforms.Compose(transforms_list) + + +# This matches (roughly) torchvision's preset for classification evaluation: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69 +def make_classification_eval_transform( + *, + resize_size: int = 256, + interpolation=transforms.InterpolationMode.BICUBIC, + crop_size: int = 224, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Compose: + transforms_list = [ + transforms.Resize(resize_size, interpolation=interpolation), + transforms.CenterCrop(crop_size), + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + return transforms.Compose(transforms_list) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccd663f33d5a21ad1f9d25db7bd378ec52aeac2 --- /dev/null +++ 
b/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py @@ -0,0 +1,271 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import random +import re +import socket +from typing import Dict, List + +import torch +import torch.distributed as dist + +_LOCAL_RANK = -1 +_LOCAL_WORLD_SIZE = -1 + + +def is_enabled() -> bool: + """ + Returns: + True if distributed training is enabled + """ + return dist.is_available() and dist.is_initialized() + + +def get_global_size() -> int: + """ + Returns: + The number of processes in the process group + """ + return dist.get_world_size() if is_enabled() else 1 + + +def get_global_rank() -> int: + """ + Returns: + The rank of the current process within the global process group. + """ + return dist.get_rank() if is_enabled() else 0 + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not is_enabled(): + return 0 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_RANK + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not is_enabled(): + return 1 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_WORLD_SIZE + + +def is_main_process() -> bool: + """ + Returns: + True if the current process is the main one. + """ + return get_global_rank() == 0 + + +def _restrict_print_to_main_process() -> None: + """ + This function disables printing when not in the main process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_main_process() or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def _get_master_port(seed: int = 0) -> int: + MIN_MASTER_PORT, MAX_MASTER_PORT = (20_000, 60_000) + + master_port_str = os.environ.get("MASTER_PORT") + if master_port_str is None: + rng = random.Random(seed) + return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT) + + return int(master_port_str) + + +def _get_available_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # A "" host address means INADDR_ANY i.e. binding to all interfaces. + # Note this is not compatible with IPv6. 
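# NOTE (added, descriptive only): binding to port 0 asks the OS for any free
# ephemeral port; getsockname() below then reveals which port was actually
# assigned, and that value later becomes MASTER_PORT when neither Slurm nor a
# preset torch.distributed environment supplies one.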
+ s.bind(("", 0)) + port = s.getsockname()[1] + return port + + +_TORCH_DISTRIBUTED_ENV_VARS = ( + "MASTER_ADDR", + "MASTER_PORT", + "RANK", + "WORLD_SIZE", + "LOCAL_RANK", + "LOCAL_WORLD_SIZE", +) + + +def _collect_env_vars() -> Dict[str, str]: + return {env_var: os.environ[env_var] for env_var in _TORCH_DISTRIBUTED_ENV_VARS if env_var in os.environ} + + +def _is_slurm_job_process() -> bool: + return "SLURM_JOB_ID" in os.environ + + +def _parse_slurm_node_list(s: str) -> List[str]: + nodes = [] + # Extract "hostname", "hostname[1-2,3,4-5]," substrings + p = re.compile(r"(([^\[]+)(?:\[([^\]]+)\])?),?") + for m in p.finditer(s): + prefix, suffixes = s[m.start(2) : m.end(2)], s[m.start(3) : m.end(3)] + for suffix in suffixes.split(","): + span = suffix.split("-") + if len(span) == 1: + nodes.append(prefix + suffix) + else: + width = len(span[0]) + start, end = int(span[0]), int(span[1]) + 1 + nodes.extend([prefix + f"{i:0{width}}" for i in range(start, end)]) + return nodes + + +def _check_env_variable(key: str, new_value: str): + # Only check for difference with preset environment variables + if key in os.environ and os.environ[key] != new_value: + raise RuntimeError(f"Cannot export environment variables as {key} is already set") + + +class _TorchDistributedEnvironment: + def __init__(self): + self.master_addr = "127.0.0.1" + self.master_port = 0 + self.rank = -1 + self.world_size = -1 + self.local_rank = -1 + self.local_world_size = -1 + + if _is_slurm_job_process(): + return self._set_from_slurm_env() + + env_vars = _collect_env_vars() + if not env_vars: + # Environment is not set + pass + elif len(env_vars) == len(_TORCH_DISTRIBUTED_ENV_VARS): + # Environment is fully set + return self._set_from_preset_env() + else: + # Environment is partially set + collected_env_vars = ", ".join(env_vars.keys()) + raise RuntimeError(f"Partially set environment: {collected_env_vars}") + + if torch.cuda.device_count() > 0: + return self._set_from_local() + + raise RuntimeError("Can't initialize PyTorch distributed environment") + + # Slurm job created with sbatch, submitit, etc... + def _set_from_slurm_env(self): + # logger.info("Initialization from Slurm environment") + job_id = int(os.environ["SLURM_JOB_ID"]) + node_count = int(os.environ["SLURM_JOB_NUM_NODES"]) + nodes = _parse_slurm_node_list(os.environ["SLURM_JOB_NODELIST"]) + assert len(nodes) == node_count + + self.master_addr = nodes[0] + self.master_port = _get_master_port(seed=job_id) + self.rank = int(os.environ["SLURM_PROCID"]) + self.world_size = int(os.environ["SLURM_NTASKS"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["SLURM_LOCALID"]) + self.local_world_size = self.world_size // node_count + assert self.local_rank < self.local_world_size + + # Single node job with preset environment (i.e. torchrun) + def _set_from_preset_env(self): + # logger.info("Initialization from preset environment") + self.master_addr = os.environ["MASTER_ADDR"] + self.master_port = os.environ["MASTER_PORT"] + self.rank = int(os.environ["RANK"]) + self.world_size = int(os.environ["WORLD_SIZE"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["LOCAL_RANK"]) + self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + assert self.local_rank < self.local_world_size + + # Single node and GPU job (i.e. 
local script run) + def _set_from_local(self): + # logger.info("Initialization from local") + self.master_addr = "127.0.0.1" + self.master_port = _get_available_port() + self.rank = 0 + self.world_size = 1 + self.local_rank = 0 + self.local_world_size = 1 + + def export(self, *, overwrite: bool) -> "_TorchDistributedEnvironment": + # See the "Environment variable initialization" section from + # https://pytorch.org/docs/stable/distributed.html for the complete list of + # environment variables required for the env:// initialization method. + env_vars = { + "MASTER_ADDR": self.master_addr, + "MASTER_PORT": str(self.master_port), + "RANK": str(self.rank), + "WORLD_SIZE": str(self.world_size), + "LOCAL_RANK": str(self.local_rank), + "LOCAL_WORLD_SIZE": str(self.local_world_size), + } + if not overwrite: + for k, v in env_vars.items(): + _check_env_variable(k, v) + + os.environ.update(env_vars) + return self + + +def enable(*, set_cuda_current_device: bool = True, overwrite: bool = False, allow_nccl_timeout: bool = False): + """Enable distributed mode + + Args: + set_cuda_current_device: If True, call torch.cuda.set_device() to set the + current PyTorch CUDA device to the one matching the local rank. + overwrite: If True, overwrites already set variables. Else fails. + """ + + global _LOCAL_RANK, _LOCAL_WORLD_SIZE + if _LOCAL_RANK >= 0 or _LOCAL_WORLD_SIZE >= 0: + raise RuntimeError("Distributed mode has already been enabled") + torch_env = _TorchDistributedEnvironment() + torch_env.export(overwrite=overwrite) + + if set_cuda_current_device: + torch.cuda.set_device(torch_env.local_rank) + + if allow_nccl_timeout: + # This allows to use torch distributed timeout in a NCCL backend + key, value = "NCCL_ASYNC_ERROR_HANDLING", "1" + if not overwrite: + _check_env_variable(key, value) + os.environ[key] = value + + dist.init_process_group(backend="nccl") + dist.barrier() + + # Finalize setup + _LOCAL_RANK = torch_env.local_rank + _LOCAL_WORLD_SIZE = torch_env.local_world_size + _restrict_print_to_main_process() diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..02ee261348e9871b10bfc40b7283b4f6205cba18 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py @@ -0,0 +1,405 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
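A minimal single-GPU usage sketch of eval_knn_with_model(), which is defined further down in this file. The torch.hub entry-point name and the dataset strings are illustrative assumptions; the keyword arguments follow the signatures below.

    import torch
    import dinov2.distributed as distributed
    from dinov2.eval.knn import eval_knn_with_model

    distributed.enable(overwrite=True)  # KnnModule relies on torch.distributed collectives
    model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14").cuda().eval()
    results = eval_knn_with_model(
        model=model,
        output_dir="./knn_eval",
        train_dataset_str="ImageNet:split=TRAIN",
        val_dataset_str="ImageNet:split=VAL",
        nb_knn=(10, 20, 100, 200),  # defaults from get_args_parser() below
        temperature=0.07,
        batch_size=256,
    )
    # Top-1/Top-5 accuracies (in percent) per (subset, k) setting; on the main
    # process they are also appended to <output_dir>/results_eval_knn.json.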
+ +import argparse +from functools import partial +import json +import logging +import os +import sys +from typing import List, Optional + +import torch +from torch.nn.functional import one_hot, softmax + +import dinov2.distributed as distributed +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data.transforms import make_classification_eval_transform +from dinov2.eval.metrics import AccuracyAveraging, build_topk_accuracy_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--nb_knn", + nargs="+", + type=int, + help="Number of NN to use. 20 is usually working the best.", + ) + parser.add_argument( + "--temperature", + type=float, + help="Temperature used in the voting coefficient", + ) + parser.add_argument( + "--gather-on-cpu", + action="store_true", + help="Whether to gather the train features on cpu, slower" + "but useful to avoid OOM for large datasets (e.g. ImageNet22k).", + ) + parser.add_argument( + "--batch-size", + type=int, + help="Batch size.", + ) + parser.add_argument( + "--n-per-class-list", + nargs="+", + type=int, + help="Number to take per class", + ) + parser.add_argument( + "--n-tries", + type=int, + help="Number of tries", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + nb_knn=[10, 20, 100, 200], + temperature=0.07, + batch_size=256, + n_per_class_list=[-1], + n_tries=1, + ) + return parser + + +class KnnModule(torch.nn.Module): + """ + Gets knn of test features from all processes on a chunk of the train features + + Each rank gets a chunk of the train features as well as a chunk of the test features. + In `compute_neighbors`, for each rank one after the other, its chunk of test features + is sent to all devices, partial knns are computed with each chunk of train features + then collated back on the original device. 
+ """ + + def __init__(self, train_features, train_labels, nb_knn, T, device, num_classes=1000): + super().__init__() + + self.global_rank = distributed.get_global_rank() + self.global_size = distributed.get_global_size() + + self.device = device + self.train_features_rank_T = train_features.chunk(self.global_size)[self.global_rank].T.to(self.device) + self.candidates = train_labels.chunk(self.global_size)[self.global_rank].view(1, -1).to(self.device) + + self.nb_knn = nb_knn + self.max_k = max(self.nb_knn) + self.T = T + self.num_classes = num_classes + + def _get_knn_sims_and_labels(self, similarity, train_labels): + topk_sims, indices = similarity.topk(self.max_k, largest=True, sorted=True) + neighbors_labels = torch.gather(train_labels, 1, indices) + return topk_sims, neighbors_labels + + def _similarity_for_rank(self, features_rank, source_rank): + # Send the features from `source_rank` to all ranks + broadcast_shape = torch.tensor(features_rank.shape).to(self.device) + torch.distributed.broadcast(broadcast_shape, source_rank) + + broadcasted = features_rank + if self.global_rank != source_rank: + broadcasted = torch.zeros(*broadcast_shape, dtype=features_rank.dtype, device=self.device) + torch.distributed.broadcast(broadcasted, source_rank) + + # Compute the neighbors for `source_rank` among `train_features_rank_T` + similarity_rank = torch.mm(broadcasted, self.train_features_rank_T) + candidate_labels = self.candidates.expand(len(similarity_rank), -1) + return self._get_knn_sims_and_labels(similarity_rank, candidate_labels) + + def _gather_all_knn_for_rank(self, topk_sims, neighbors_labels, target_rank): + # Gather all neighbors for `target_rank` + topk_sims_rank = retrieved_rank = None + if self.global_rank == target_rank: + topk_sims_rank = [torch.zeros_like(topk_sims) for _ in range(self.global_size)] + retrieved_rank = [torch.zeros_like(neighbors_labels) for _ in range(self.global_size)] + + torch.distributed.gather(topk_sims, topk_sims_rank, dst=target_rank) + torch.distributed.gather(neighbors_labels, retrieved_rank, dst=target_rank) + + if self.global_rank == target_rank: + # Perform a second top-k on the k * global_size retrieved neighbors + topk_sims_rank = torch.cat(topk_sims_rank, dim=1) + retrieved_rank = torch.cat(retrieved_rank, dim=1) + results = self._get_knn_sims_and_labels(topk_sims_rank, retrieved_rank) + return results + return None + + def compute_neighbors(self, features_rank): + for rank in range(self.global_size): + topk_sims, neighbors_labels = self._similarity_for_rank(features_rank, rank) + results = self._gather_all_knn_for_rank(topk_sims, neighbors_labels, rank) + if results is not None: + topk_sims_rank, neighbors_labels_rank = results + return topk_sims_rank, neighbors_labels_rank + + def forward(self, features_rank): + """ + Compute the results on all values of `self.nb_knn` neighbors from the full `self.max_k` + """ + assert all(k <= self.max_k for k in self.nb_knn) + + topk_sims, neighbors_labels = self.compute_neighbors(features_rank) + batch_size = neighbors_labels.shape[0] + topk_sims_transform = softmax(topk_sims / self.T, 1) + matmul = torch.mul( + one_hot(neighbors_labels, num_classes=self.num_classes), + topk_sims_transform.view(batch_size, -1, 1), + ) + probas_for_k = {k: torch.sum(matmul[:, :k, :], 1) for k in self.nb_knn} + return probas_for_k + + +class DictKeysModule(torch.nn.Module): + def __init__(self, keys): + super().__init__() + self.keys = keys + + def forward(self, features_dict, targets): + for k in self.keys: + features_dict = 
features_dict[k] + return {"preds": features_dict, "target": targets} + + +def create_module_dict(*, module, n_per_class_list, n_tries, nb_knn, train_features, train_labels): + modules = {} + mapping = create_class_indices_mapping(train_labels) + for npc in n_per_class_list: + if npc < 0: # Only one try needed when using the full data + full_module = module( + train_features=train_features, + train_labels=train_labels, + nb_knn=nb_knn, + ) + modules["full"] = ModuleDictWithForward({"1": full_module}) + continue + all_tries = {} + for t in range(n_tries): + final_indices = filter_train(mapping, npc, seed=t) + k_list = list(set(nb_knn + [npc])) + k_list = sorted([el for el in k_list if el <= npc]) + all_tries[str(t)] = module( + train_features=train_features[final_indices], + train_labels=train_labels[final_indices], + nb_knn=k_list, + ) + modules[f"{npc} per class"] = ModuleDictWithForward(all_tries) + + return ModuleDictWithForward(modules) + + +def filter_train(mapping, n_per_class, seed): + torch.manual_seed(seed) + final_indices = [] + for k in mapping.keys(): + index = torch.randperm(len(mapping[k]))[:n_per_class] + final_indices.append(mapping[k][index]) + return torch.cat(final_indices).squeeze() + + +def create_class_indices_mapping(labels): + unique_labels, inverse = torch.unique(labels, return_inverse=True) + mapping = {unique_labels[i]: (inverse == i).nonzero() for i in range(len(unique_labels))} + return mapping + + +class ModuleDictWithForward(torch.nn.ModuleDict): + def forward(self, *args, **kwargs): + return {k: module(*args, **kwargs) for k, module in self._modules.items()} + + +def eval_knn( + model, + train_dataset, + val_dataset, + accuracy_averaging, + nb_knn, + temperature, + batch_size, + num_workers, + gather_on_cpu, + n_per_class_list=[-1], + n_tries=1, +): + model = ModelWithNormalize(model) + + logger.info("Extracting features for train set...") + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu + ) + logger.info(f"Train features created, shape {train_features.shape}.") + + val_dataloader = make_data_loader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=True, + ) + num_classes = train_labels.max() + 1 + metric_collection = build_topk_accuracy_metric(accuracy_averaging, num_classes=num_classes) + + device = torch.cuda.current_device() + partial_module = partial(KnnModule, T=temperature, device=device, num_classes=num_classes) + knn_module_dict = create_module_dict( + module=partial_module, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + nb_knn=nb_knn, + train_features=train_features, + train_labels=train_labels, + ) + postprocessors, metrics = {}, {} + for n_per_class, knn_module in knn_module_dict.items(): + for t, knn_try in knn_module.items(): + postprocessors = { + **postprocessors, + **{(n_per_class, t, k): DictKeysModule([n_per_class, t, k]) for k in knn_try.nb_knn}, + } + metrics = {**metrics, **{(n_per_class, t, k): metric_collection.clone() for k in knn_try.nb_knn}} + model_with_knn = torch.nn.Sequential(model, knn_module_dict) + + # ============ evaluation ... 
============ + logger.info("Start the k-NN classification.") + _, results_dict = evaluate(model_with_knn, val_dataloader, postprocessors, metrics, device) + + # Averaging the results over the n tries for each value of n_per_class + for n_per_class, knn_module in knn_module_dict.items(): + first_try = list(knn_module.keys())[0] + k_list = knn_module[first_try].nb_knn + for k in k_list: + keys = results_dict[(n_per_class, first_try, k)].keys() # keys are e.g. `top-1` and `top-5` + results_dict[(n_per_class, k)] = { + key: torch.mean(torch.stack([results_dict[(n_per_class, t, k)][key] for t in knn_module.keys()])) + for key in keys + } + for t in knn_module.keys(): + del results_dict[(n_per_class, t, k)] + + return results_dict + + +def eval_knn_with_model( + model, + output_dir, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + nb_knn=(10, 20, 100, 200), + temperature=0.07, + autocast_dtype=torch.float, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=False, + batch_size=256, + num_workers=5, + n_per_class_list=[-1], + n_tries=1, +): + transform = transform or make_classification_eval_transform() + + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=transform, + ) + val_dataset = make_dataset( + dataset_str=val_dataset_str, + transform=transform, + ) + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_knn = eval_knn( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + accuracy_averaging=accuracy_averaging, + nb_knn=nb_knn, + temperature=temperature, + batch_size=batch_size, + num_workers=num_workers, + gather_on_cpu=gather_on_cpu, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + ) + + results_dict = {} + if distributed.is_main_process(): + for knn_ in results_dict_knn.keys(): + top1 = results_dict_knn[knn_]["top-1"].item() * 100.0 + top5 = results_dict_knn[knn_]["top-5"].item() * 100.0 + results_dict[f"{knn_} Top 1"] = top1 + results_dict[f"{knn_} Top 5"] = top5 + logger.info(f"{knn_} classifier result: Top1: {top1:.2f} Top5: {top5:.2f}") + + metrics_file_path = os.path.join(output_dir, "results_eval_knn.json") + with open(metrics_file_path, "a") as f: + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + + if distributed.is_enabled(): + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_knn_with_model( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + nb_knn=args.nb_knn, + temperature=args.temperature, + autocast_dtype=autocast_dtype, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=args.gather_on_cpu, + batch_size=args.batch_size, + num_workers=5, + n_per_class_list=args.n_per_class_list, + n_tries=args.n_tries, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 k-NN evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8202606999c0c01353904d8b02d2ff3509fef9 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py @@ -0,0 +1,626 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +from functools import partial +import json +import logging +import os +import sys +from typing import List, Optional + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel +from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer + +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data.transforms import make_classification_eval_transform, make_classification_train_transform +import dinov2.distributed as distributed +from dinov2.eval.metrics import MetricType, build_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import ModelWithIntermediateLayers, evaluate +from dinov2.logging import MetricLogger + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--test-datasets", + dest="test_dataset_strs", + type=str, + nargs="+", + help="Test datasets, none to reuse the validation dataset", + ) + parser.add_argument( + "--epochs", + type=int, + help="Number of training epochs", + ) + parser.add_argument( + "--batch-size", + type=int, + help="Batch Size (per GPU)", + ) + parser.add_argument( + "--num-workers", + type=int, + help="Number de Workers", + ) + parser.add_argument( + "--epoch-length", + type=int, + help="Length of an epoch in number of iterations", + ) + parser.add_argument( + "--save-checkpoint-frequency", + type=int, + help="Number of epochs between two named checkpoint saves.", + ) + parser.add_argument( + "--eval-period-iterations", + type=int, + help="Number of iterations between two evaluations.", + ) + parser.add_argument( + "--learning-rates", + nargs="+", + type=float, + help="Learning rates to grid search.", + ) + parser.add_argument( + "--no-resume", + action="store_true", + help="Whether to not resume from existing checkpoints", + ) + parser.add_argument( + "--val-metric-type", + type=MetricType, + choices=list(MetricType), + help="Validation metric", + ) + parser.add_argument( + "--test-metric-types", + type=MetricType, + choices=list(MetricType), + nargs="+", + help="Evaluation metric", + ) + parser.add_argument( + "--classifier-fpath", + type=str, + help="Path to a file containing pretrained linear classifiers", + ) + parser.add_argument( + "--val-class-mapping-fpath", + type=str, + help="Path to a file containing a mapping to adjust classifier outputs", + ) + parser.add_argument( + "--test-class-mapping-fpaths", + nargs="+", + type=str, + help="Path to a file containing a mapping to adjust classifier outputs", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + test_dataset_strs=None, + epochs=10, + batch_size=128, + num_workers=8, + 
epoch_length=1250, + save_checkpoint_frequency=20, + eval_period_iterations=1250, + learning_rates=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + ) + return parser + + +def has_ddp_wrapper(m: nn.Module) -> bool: + return isinstance(m, DistributedDataParallel) + + +def remove_ddp_wrapper(m: nn.Module) -> nn.Module: + return m.module if has_ddp_wrapper(m) else m + + +def _pad_and_collate(batch): + maxlen = max(len(targets) for image, targets in batch) + padded_batch = [ + (image, np.pad(targets, (0, maxlen - len(targets)), constant_values=-1)) for image, targets in batch + ] + return torch.utils.data.default_collate(padded_batch) + + +def create_linear_input(x_tokens_list, use_n_blocks, use_avgpool): + intermediate_output = x_tokens_list[-use_n_blocks:] + output = torch.cat([class_token for _, class_token in intermediate_output], dim=-1) + if use_avgpool: + output = torch.cat( + ( + output, + torch.mean(intermediate_output[-1][0], dim=1), # patch tokens + ), + dim=-1, + ) + output = output.reshape(output.shape[0], -1) + return output.float() + + +class LinearClassifier(nn.Module): + """Linear layer to train on top of frozen features""" + + def __init__(self, out_dim, use_n_blocks, use_avgpool, num_classes=1000): + super().__init__() + self.out_dim = out_dim + self.use_n_blocks = use_n_blocks + self.use_avgpool = use_avgpool + self.num_classes = num_classes + self.linear = nn.Linear(out_dim, num_classes) + self.linear.weight.data.normal_(mean=0.0, std=0.01) + self.linear.bias.data.zero_() + + def forward(self, x_tokens_list): + output = create_linear_input(x_tokens_list, self.use_n_blocks, self.use_avgpool) + return self.linear(output) + + +class AllClassifiers(nn.Module): + def __init__(self, classifiers_dict): + super().__init__() + self.classifiers_dict = nn.ModuleDict() + self.classifiers_dict.update(classifiers_dict) + + def forward(self, inputs): + return {k: v.forward(inputs) for k, v in self.classifiers_dict.items()} + + def __len__(self): + return len(self.classifiers_dict) + + +class LinearPostprocessor(nn.Module): + def __init__(self, linear_classifier, class_mapping=None): + super().__init__() + self.linear_classifier = linear_classifier + self.register_buffer("class_mapping", None if class_mapping is None else torch.LongTensor(class_mapping)) + + def forward(self, samples, targets): + preds = self.linear_classifier(samples) + return { + "preds": preds[:, self.class_mapping] if self.class_mapping is not None else preds, + "target": targets, + } + + +def scale_lr(learning_rates, batch_size): + return learning_rates * (batch_size * distributed.get_global_size()) / 256.0 + + +def setup_linear_classifiers(sample_output, n_last_blocks_list, learning_rates, batch_size, num_classes=1000): + linear_classifiers_dict = nn.ModuleDict() + optim_param_groups = [] + for n in n_last_blocks_list: + for avgpool in [False, True]: + for _lr in learning_rates: + lr = scale_lr(_lr, batch_size) + out_dim = create_linear_input(sample_output, use_n_blocks=n, use_avgpool=avgpool).shape[1] + linear_classifier = LinearClassifier( + out_dim, use_n_blocks=n, use_avgpool=avgpool, num_classes=num_classes + ) + linear_classifier = linear_classifier.cuda() + linear_classifiers_dict[ + f"classifier_{n}_blocks_avgpool_{avgpool}_lr_{lr:.5f}".replace(".", "_") + ] = linear_classifier + optim_param_groups.append({"params": 
linear_classifier.parameters(), "lr": lr}) + + linear_classifiers = AllClassifiers(linear_classifiers_dict) + if distributed.is_enabled(): + linear_classifiers = nn.parallel.DistributedDataParallel(linear_classifiers) + + return linear_classifiers, optim_param_groups + + +@torch.no_grad() +def evaluate_linear_classifiers( + feature_model, + linear_classifiers, + data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=None, + best_classifier_on_val=None, +): + logger.info("running validation !") + + num_classes = len(class_mapping) if class_mapping is not None else training_num_classes + metric = build_metric(metric_type, num_classes=num_classes) + postprocessors = {k: LinearPostprocessor(v, class_mapping) for k, v in linear_classifiers.classifiers_dict.items()} + metrics = {k: metric.clone() for k in linear_classifiers.classifiers_dict} + + _, results_dict_temp = evaluate( + feature_model, + data_loader, + postprocessors, + metrics, + torch.cuda.current_device(), + ) + + logger.info("") + results_dict = {} + max_accuracy = 0 + best_classifier = "" + for i, (classifier_string, metric) in enumerate(results_dict_temp.items()): + logger.info(f"{prefixstring} -- Classifier: {classifier_string} * {metric}") + if ( + best_classifier_on_val is None and metric["top-1"].item() > max_accuracy + ) or classifier_string == best_classifier_on_val: + max_accuracy = metric["top-1"].item() + best_classifier = classifier_string + + results_dict["best_classifier"] = {"name": best_classifier, "accuracy": max_accuracy} + + logger.info(f"best classifier: {results_dict['best_classifier']}") + + if distributed.is_main_process(): + with open(metrics_file_path, "a") as f: + f.write(f"iter: {iteration}\n") + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + f.write("\n") + + return results_dict + + +def eval_linear( + *, + feature_model, + linear_classifiers, + train_data_loader, + val_data_loader, + metrics_file_path, + optimizer, + scheduler, + output_dir, + max_iter, + checkpoint_period, # In number of iter, creates a new file every period + running_checkpoint_period, # Period to update main checkpoint file + eval_period, + metric_type, + training_num_classes, + resume=True, + classifier_fpath=None, + val_class_mapping=None, +): + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + + periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) + iteration = start_iter + logger.info("Starting training from iteration {}".format(start_iter)) + metric_logger = MetricLogger(delimiter=" ") + header = "Training" + + for data, labels in metric_logger.log_every( + train_data_loader, + 10, + header, + max_iter, + start_iter, + ): + data = data.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + features = feature_model(data) + outputs = linear_classifiers(features) + + losses = {f"loss_{k}": nn.CrossEntropyLoss()(v, labels) for k, v in outputs.items()} + loss = sum(losses.values()) + + # compute the gradients + optimizer.zero_grad() + loss.backward() + + # step + optimizer.step() + scheduler.step() + + # log + if iteration % 10 == 0: + torch.cuda.synchronize() + metric_logger.update(loss=loss.item()) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + print("lr", optimizer.param_groups[0]["lr"]) + + if iteration - 
start_iter > 5: + if iteration % running_checkpoint_period == 0: + torch.cuda.synchronize() + if distributed.is_main_process(): + logger.info("Checkpointing running_checkpoint") + periodic_checkpointer.save("running_checkpoint_linear_eval", iteration=iteration) + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + if eval_period > 0 and (iteration + 1) % eval_period == 0 and iteration != max_iter - 1: + _ = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + prefixstring=f"ITER: {iteration}", + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + torch.cuda.synchronize() + + iteration = iteration + 1 + + val_results_dict = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + return val_results_dict, feature_model, linear_classifiers, iteration + + +def make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type): + test_dataset = make_dataset( + dataset_str=test_dataset_str, + transform=make_classification_eval_transform(), + ) + test_data_loader = make_data_loader( + dataset=test_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=False, + collate_fn=_pad_and_collate if metric_type == MetricType.IMAGENET_REAL_ACCURACY else None, + ) + return test_data_loader + + +def test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + best_classifier_on_val, + prefixstring="", + test_class_mappings=[None], +): + results_dict = {} + for test_dataset_str, class_mapping, metric_type in zip(test_dataset_strs, test_class_mappings, test_metric_types): + logger.info(f"Testing on {test_dataset_str}") + test_data_loader = make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type) + dataset_results_dict = evaluate_linear_classifiers( + feature_model, + remove_ddp_wrapper(linear_classifiers), + test_data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=class_mapping, + best_classifier_on_val=best_classifier_on_val, + ) + results_dict[f"{test_dataset_str}_accuracy"] = 100.0 * dataset_results_dict["best_classifier"]["accuracy"] + return results_dict + + +def run_eval_linear( + model, + output_dir, + train_dataset_str, + val_dataset_str, + batch_size, + epochs, + epoch_length, + num_workers, + save_checkpoint_frequency, + eval_period_iterations, + learning_rates, + autocast_dtype, + test_dataset_strs=None, + resume=True, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, +): + seed = 0 + + if test_dataset_strs is None: + test_dataset_strs = [val_dataset_str] + if test_metric_types is None: + test_metric_types = [val_metric_type] * len(test_dataset_strs) + else: + assert len(test_metric_types) == len(test_dataset_strs) + assert len(test_dataset_strs) == 
len(test_class_mapping_fpaths) + + train_transform = make_classification_train_transform() + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=train_transform, + ) + training_num_classes = len(torch.unique(torch.Tensor(train_dataset.get_targets().astype(int)))) + sampler_type = SamplerType.SHARDED_INFINITE + # sampler_type = SamplerType.INFINITE + + n_last_blocks_list = [1, 4] + n_last_blocks = max(n_last_blocks_list) + autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=autocast_dtype) + feature_model = ModelWithIntermediateLayers(model, n_last_blocks, autocast_ctx) + sample_output = feature_model(train_dataset[0][0].unsqueeze(0).cuda()) + + linear_classifiers, optim_param_groups = setup_linear_classifiers( + sample_output, + n_last_blocks_list, + learning_rates, + batch_size, + training_num_classes, + ) + + optimizer = torch.optim.SGD(optim_param_groups, momentum=0.9, weight_decay=0) + max_iter = epochs * epoch_length + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter, eta_min=0) + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + train_data_loader = make_data_loader( + dataset=train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + seed=seed, + sampler_type=sampler_type, + sampler_advance=start_iter, + drop_last=True, + persistent_workers=True, + ) + val_data_loader = make_eval_data_loader(val_dataset_str, batch_size, num_workers, val_metric_type) + + checkpoint_period = save_checkpoint_frequency * epoch_length + + if val_class_mapping_fpath is not None: + logger.info(f"Using class mapping from {val_class_mapping_fpath}") + val_class_mapping = np.load(val_class_mapping_fpath) + else: + val_class_mapping = None + + test_class_mappings = [] + for class_mapping_fpath in test_class_mapping_fpaths: + if class_mapping_fpath is not None and class_mapping_fpath != "None": + logger.info(f"Using class mapping from {class_mapping_fpath}") + class_mapping = np.load(class_mapping_fpath) + else: + class_mapping = None + test_class_mappings.append(class_mapping) + + metrics_file_path = os.path.join(output_dir, "results_eval_linear.json") + val_results_dict, feature_model, linear_classifiers, iteration = eval_linear( + feature_model=feature_model, + linear_classifiers=linear_classifiers, + train_data_loader=train_data_loader, + val_data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + optimizer=optimizer, + scheduler=scheduler, + output_dir=output_dir, + max_iter=max_iter, + checkpoint_period=checkpoint_period, + running_checkpoint_period=epoch_length, + eval_period=eval_period_iterations, + metric_type=val_metric_type, + training_num_classes=training_num_classes, + resume=resume, + val_class_mapping=val_class_mapping, + classifier_fpath=classifier_fpath, + ) + results_dict = {} + if len(test_dataset_strs) > 1 or test_dataset_strs[0] != val_dataset_str: + results_dict = test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + 0, # num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + val_results_dict["best_classifier"]["name"], + prefixstring="", + test_class_mappings=test_class_mappings, + ) + results_dict["best_classifier"] = val_results_dict["best_classifier"]["name"] + results_dict[f"{val_dataset_str}_accuracy"] = 100.0 * 
val_results_dict["best_classifier"]["accuracy"] + logger.info("Test Results Dict " + str(results_dict)) + + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + run_eval_linear( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + test_dataset_strs=args.test_dataset_strs, + batch_size=args.batch_size, + epochs=args.epochs, + epoch_length=args.epoch_length, + num_workers=args.num_workers, + save_checkpoint_frequency=args.save_checkpoint_frequency, + eval_period_iterations=args.eval_period_iterations, + learning_rates=args.learning_rates, + autocast_dtype=autocast_dtype, + resume=not args.no_resume, + classifier_fpath=args.classifier_fpath, + val_metric_type=args.val_metric_type, + test_metric_types=args.test_metric_types, + val_class_mapping_fpath=args.val_class_mapping_fpath, + test_class_mapping_fpaths=args.test_class_mapping_fpaths, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 linear evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..2e6ede2b616208cb49c7af67d58c8e6e4afb60e1 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py @@ -0,0 +1,445 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import gc +import logging +import sys +import time +from typing import List, Optional + +from cuml.linear_model import LogisticRegression +import torch +import torch.backends.cudnn as cudnn +import torch.distributed +from torch import nn +from torch.utils.data import TensorDataset +from torchmetrics import MetricTracker + +from dinov2.data import make_dataset +from dinov2.data.transforms import make_classification_eval_transform +from dinov2.distributed import get_global_rank, get_global_size +from dinov2.eval.metrics import MetricType, build_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import evaluate, extract_features +from dinov2.utils.dtype import as_torch_dtype + + +logger = logging.getLogger("dinov2") + +DEFAULT_MAX_ITER = 1_000 +C_POWER_RANGE = torch.linspace(-6, 5, 45) +_CPU_DEVICE = torch.device("cpu") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--finetune-dataset-str", + dest="finetune_dataset_str", + type=str, + help="Fine-tuning dataset", + ) + parser.add_argument( + "--finetune-on-val", + action="store_true", + help="If there is no finetune 
dataset, whether to choose the " + "hyperparameters on the val set instead of 10%% of the train dataset", + ) + parser.add_argument( + "--metric-type", + type=MetricType, + choices=list(MetricType), + help="Metric type", + ) + parser.add_argument( + "--train-features-device", + type=str, + help="Device to gather train features (cpu, cuda, cuda:0, etc.), default: %(default)s", + ) + parser.add_argument( + "--train-dtype", + type=str, + help="Data type to convert the train features to (default: %(default)s)", + ) + parser.add_argument( + "--max-train-iters", + type=int, + help="Maximum number of train iterations (default: %(default)s)", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + metric_type=MetricType.MEAN_ACCURACY, + train_features_device="cpu", + train_dtype="float64", + max_train_iters=DEFAULT_MAX_ITER, + finetune_on_val=False, + ) + return parser + + +class LogRegModule(nn.Module): + def __init__( + self, + C, + max_iter=DEFAULT_MAX_ITER, + dtype=torch.float64, + device=_CPU_DEVICE, + ): + super().__init__() + self.dtype = dtype + self.device = device + self.estimator = LogisticRegression( + penalty="l2", + C=C, + max_iter=max_iter, + output_type="numpy", + tol=1e-12, + linesearch_max_iter=50, + ) + + def forward(self, samples, targets): + samples_device = samples.device + samples = samples.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + samples = samples.numpy() + probas = self.estimator.predict_proba(samples) + return {"preds": torch.from_numpy(probas).to(samples_device), "target": targets} + + def fit(self, train_features, train_labels): + train_features = train_features.to(dtype=self.dtype, device=self.device) + train_labels = train_labels.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + # both cuML and sklearn only work with numpy arrays on CPU + train_features = train_features.numpy() + train_labels = train_labels.numpy() + self.estimator.fit(train_features, train_labels) + + +def evaluate_model(*, logreg_model, logreg_metric, test_data_loader, device): + postprocessors = {"metrics": logreg_model} + metrics = {"metrics": logreg_metric} + return evaluate(nn.Identity(), test_data_loader, postprocessors, metrics, device) + + +def train_for_C(*, C, max_iter, train_features, train_labels, dtype=torch.float64, device=_CPU_DEVICE): + logreg_model = LogRegModule(C, max_iter=max_iter, dtype=dtype, device=device) + logreg_model.fit(train_features, train_labels) + return logreg_model + + +def train_and_evaluate( + *, + C, + max_iter, + train_features, + train_labels, + logreg_metric, + test_data_loader, + train_dtype=torch.float64, + train_features_device, + eval_device, +): + logreg_model = train_for_C( + C=C, + max_iter=max_iter, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + return evaluate_model( + logreg_model=logreg_model, + logreg_metric=logreg_metric, + test_data_loader=test_data_loader, + device=eval_device, + ) + + +def sweep_C_values( + *, + train_features, + train_labels, + test_data_loader, + metric_type, + num_classes, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + if metric_type == MetricType.PER_CLASS_ACCURACY: + # If we want to output per-class accuracy, we select the hyperparameters with mean per class + metric_type = MetricType.MEAN_PER_CLASS_ACCURACY + logreg_metric = 
build_metric(metric_type, num_classes=num_classes) + metric_tracker = MetricTracker(logreg_metric, maximize=True) + ALL_C = 10**C_POWER_RANGE + logreg_models = {} + + train_features = train_features.to(dtype=train_dtype, device=train_features_device) + train_labels = train_labels.to(device=train_features_device) + + for i in range(get_global_rank(), len(ALL_C), get_global_size()): + C = ALL_C[i].item() + logger.info( + f"Training for C = {C:.5f}, dtype={train_dtype}, " + f"features: {train_features.shape}, {train_features.dtype}, " + f"labels: {train_labels.shape}, {train_labels.dtype}" + ) + logreg_models[C] = train_for_C( + C=C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + + gather_list = [None for _ in range(get_global_size())] + torch.distributed.all_gather_object(gather_list, logreg_models) + + logreg_models_gathered = {} + for logreg_dict in gather_list: + logreg_models_gathered.update(logreg_dict) + + for i in range(len(ALL_C)): + metric_tracker.increment() + C = ALL_C[i].item() + evals = evaluate_model( + logreg_model=logreg_models_gathered[C], + logreg_metric=metric_tracker, + test_data_loader=test_data_loader, + device=torch.cuda.current_device(), + ) + logger.info(f"Trained for C = {C:.5f}, accuracies = {evals}") + + best_stats, which_epoch = metric_tracker.best_metric(return_step=True) + best_stats_100 = {k: 100.0 * v for k, v in best_stats.items()} + if which_epoch["top-1"] == i: + best_C = C + logger.info(f"Sweep best {best_stats_100}, best C = {best_C:.6f}") + + return best_stats, best_C + + +def eval_log_regression( + *, + model, + train_dataset, + val_dataset, + finetune_dataset, + metric_type, + batch_size, + num_workers, + finetune_on_val=False, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + """ + Implements the "standard" process for log regression evaluation: + The value of C is chosen by training on train_dataset and evaluating on + finetune_dataset. Then, the final model is trained on a concatenation of + train_dataset and finetune_dataset, and is evaluated on val_dataset. 
+ If there is no finetune_dataset, the value of C is the one that yields + the best results on a random 10% subset of the train dataset + """ + + start = time.time() + + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_features, val_labels = extract_features( + model, val_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_data_loader = torch.utils.data.DataLoader( + TensorDataset(val_features, val_labels), + batch_size=batch_size, + drop_last=False, + num_workers=0, + persistent_workers=False, + ) + + if finetune_dataset is None and finetune_on_val: + logger.info("Choosing hyperparameters on the val dataset") + finetune_features, finetune_labels = val_features, val_labels + elif finetune_dataset is None and not finetune_on_val: + logger.info("Choosing hyperparameters on 10% of the train dataset") + torch.manual_seed(0) + indices = torch.randperm(len(train_features), device=train_features.device) + finetune_index = indices[: len(train_features) // 10] + train_index = indices[len(train_features) // 10 :] + finetune_features, finetune_labels = train_features[finetune_index], train_labels[finetune_index] + train_features, train_labels = train_features[train_index], train_labels[train_index] + else: + logger.info("Choosing hyperparameters on the finetune dataset") + finetune_features, finetune_labels = extract_features( + model, finetune_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + # release the model - free GPU memory + del model + gc.collect() + torch.cuda.empty_cache() + finetune_data_loader = torch.utils.data.DataLoader( + TensorDataset(finetune_features, finetune_labels), + batch_size=batch_size, + drop_last=False, + ) + + if len(train_labels.shape) > 1: + num_classes = train_labels.shape[1] + else: + num_classes = train_labels.max() + 1 + + logger.info("Using cuML for logistic regression") + + best_stats, best_C = sweep_C_values( + train_features=train_features, + train_labels=train_labels, + test_data_loader=finetune_data_loader, + metric_type=metric_type, + num_classes=num_classes, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + if not finetune_on_val: + logger.info("Best parameter found, concatenating features") + train_features = torch.cat((train_features, finetune_features)) + train_labels = torch.cat((train_labels, finetune_labels)) + + logger.info("Training final model") + logreg_metric = build_metric(metric_type, num_classes=num_classes) + evals = train_and_evaluate( + C=best_C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + logreg_metric=logreg_metric.clone(), + test_data_loader=val_data_loader, + eval_device=torch.cuda.current_device(), + train_dtype=train_dtype, + train_features_device=train_features_device, + ) + + best_stats = evals[1]["metrics"] + + best_stats["best_C"] = best_C + + logger.info(f"Log regression evaluation done in {int(time.time() - start)}s") + return best_stats + + +def eval_log_regression_with_model( + model, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + autocast_dtype=torch.float, + finetune_on_val=False, + metric_type=MetricType.MEAN_ACCURACY, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + cudnn.benchmark = True 
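# NOTE (added, descriptive only): cudnn.benchmark lets cuDNN auto-tune its
# convolution algorithms for a fixed input shape, which is safe here because the
# eval transform built just below resizes and center-crops every image to the
# same 224x224 resolution. A minimal illustrative call, assuming cuML is
# installed and the distributed process group is already initialized, would be:
#     eval_log_regression_with_model(model,
#                                    train_dataset_str="ImageNet:split=TRAIN",
#                                    val_dataset_str="ImageNet:split=VAL")
# It sweeps C over 10**linspace(-6, 5, 45), trains the final classifier with the
# best C and returns the top-1/top-5 accuracies plus best_C.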
+ + transform = make_classification_eval_transform(resize_size=224) + target_transform = None + + train_dataset = make_dataset(dataset_str=train_dataset_str, transform=transform, target_transform=target_transform) + val_dataset = make_dataset(dataset_str=val_dataset_str, transform=transform, target_transform=target_transform) + if finetune_dataset_str is not None: + finetune_dataset = make_dataset( + dataset_str=finetune_dataset_str, transform=transform, target_transform=target_transform + ) + else: + finetune_dataset = None + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_logreg = eval_log_regression( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + finetune_dataset=finetune_dataset, + metric_type=metric_type, + batch_size=256, + num_workers=0, # 5, + finetune_on_val=finetune_on_val, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + results_dict = { + "top-1": results_dict_logreg["top-1"].cpu().numpy() * 100.0, + "top-5": results_dict_logreg.get("top-5", torch.tensor(0.0)).cpu().numpy() * 100.0, + "best_C": results_dict_logreg["best_C"], + } + logger.info( + "\n".join( + [ + "Training of the supervised logistic regression on frozen features completed.\n" + "Top-1 test accuracy: {acc:.1f}".format(acc=results_dict["top-1"]), + "Top-5 test accuracy: {acc:.1f}".format(acc=results_dict["top-5"]), + "obtained for C = {c:.6f}".format(c=results_dict["best_C"]), + ] + ) + ) + + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_log_regression_with_model( + model=model, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + finetune_dataset_str=args.finetune_dataset_str, + autocast_dtype=autocast_dtype, + finetune_on_val=args.finetune_on_val, + metric_type=args.metric_type, + train_dtype=as_torch_dtype(args.train_dtype), + train_features_device=torch.device(args.train_features_device), + max_train_iters=args.max_train_iters, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 logistic regression evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..80bf88da224e749dd6b3dd4b2bd90ec99eaec34e --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py @@ -0,0 +1,114 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
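For orientation, a minimal, hedged sketch of driving the frozen-feature logistic-regression evaluation above directly from Python rather than through the CLI entry point. It assumes the file above is dinov2/eval/log_regression.py, that cuML is installed, and that torch.distributed has been initialized (the function ends with a barrier); the dataset strings simply mirror the defaults visible in the signature.

# Hedged sketch, not part of the diff: calling eval_log_regression_with_model directly.
# Assumes dinov2/eval/log_regression.py is the module added above, cuML is available,
# and torch.distributed is initialized (the function calls torch.distributed.barrier()).
import torch

from dinov2.eval.log_regression import eval_log_regression_with_model
from dinov2.eval.metrics import MetricType


def run_logreg_eval(model: torch.nn.Module) -> dict:
    # model: frozen feature extractor, already on GPU and in eval mode
    return eval_log_regression_with_model(
        model=model,
        train_dataset_str="ImageNet:split=TRAIN",
        val_dataset_str="ImageNet:split=VAL",
        finetune_dataset_str=None,                  # C is then picked on a 10% split of train
        autocast_dtype=torch.float,
        finetune_on_val=False,
        metric_type=MetricType.MEAN_ACCURACY,
        train_dtype=torch.float64,
        train_features_device=torch.device("cpu"),  # gather the extracted features on CPU
    )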
+ +from enum import Enum +import logging +from typing import Any, Dict, Optional + +import torch +from torch import Tensor +from torchmetrics import Metric, MetricCollection +from torchmetrics.classification import MulticlassAccuracy +from torchmetrics.utilities.data import dim_zero_cat, select_topk + + +logger = logging.getLogger("dinov2") + + +class MetricType(Enum): + MEAN_ACCURACY = "mean_accuracy" + MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy" + PER_CLASS_ACCURACY = "per_class_accuracy" + IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy" + + @property + def accuracy_averaging(self): + return getattr(AccuracyAveraging, self.name, None) + + def __str__(self): + return self.value + + +class AccuracyAveraging(Enum): + MEAN_ACCURACY = "micro" + MEAN_PER_CLASS_ACCURACY = "macro" + PER_CLASS_ACCURACY = "none" + + def __str__(self): + return self.value + + +def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None): + if metric_type.accuracy_averaging is not None: + return build_topk_accuracy_metric( + average_type=metric_type.accuracy_averaging, + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + elif metric_type == MetricType.IMAGENET_REAL_ACCURACY: + return build_topk_imagenet_real_accuracy_metric( + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + + raise ValueError(f"Unknown metric type {metric_type}") + + +def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = { + f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks + } + return MetricCollection(metrics) + + +def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks} + return MetricCollection(metrics) + + +class ImageNetReaLAccuracy(Metric): + is_differentiable: bool = False + higher_is_better: Optional[bool] = None + full_state_update: bool = False + + def __init__( + self, + num_classes: int, + top_k: int = 1, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.num_classes = num_classes + self.top_k = top_k + self.add_state("tp", [], dist_reduce_fx="cat") + + def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore + # preds [B, D] + # target [B, A] + # preds_oh [B, D] with 0 and 1 + # select top K highest probabilities, use one hot representation + preds_oh = select_topk(preds, self.top_k) + # target_oh [B, D + 1] with 0 and 1 + target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32) + target = target.long() + # for undefined targets (-1) use a fake value `num_classes` + target[target == -1] = self.num_classes + # fill targets, use one hot representation + target_oh.scatter_(1, target, 1) + # target_oh [B, D] (remove the fake target at index `num_classes`) + target_oh = target_oh[:, :-1] + # tp [B] with 0 and 1 + tp = (preds_oh * target_oh == 1).sum(dim=1) + # at least one match between prediction and target + tp.clip_(max=1) + # ignore instances where no targets are defined + mask = target_oh.sum(dim=1) > 0 + tp = tp[mask] + self.tp.append(tp) # type: ignore + + def compute(self) -> Tensor: + tp = dim_zero_cat(self.tp) # type: ignore + return tp.float().mean() diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py new file mode 
100644 index 0000000000000000000000000000000000000000..e7fadc2b63b994f569c8def82a43ed08ccd15b33 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +from typing import Any, List, Optional, Tuple + +import torch +import torch.backends.cudnn as cudnn + +from dinov2.models import build_model_from_cfg +from dinov2.utils.config import setup +import dinov2.utils.utils as dinov2_utils + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parser = argparse.ArgumentParser( + description=description, + parents=parents or [], + add_help=add_help, + ) + parser.add_argument( + "--config-file", + type=str, + help="Model configuration file", + ) + parser.add_argument( + "--pretrained-weights", + type=str, + help="Pretrained model weights", + ) + parser.add_argument( + "--output-dir", + default="", + type=str, + help="Output directory to write results and logs", + ) + parser.add_argument( + "--opts", + help="Extra configuration options", + default=[], + nargs="+", + ) + return parser + + +def get_autocast_dtype(config): + teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype + if teacher_dtype_str == "fp16": + return torch.half + elif teacher_dtype_str == "bf16": + return torch.bfloat16 + else: + return torch.float + + +def build_model_for_eval(config, pretrained_weights): + model, _ = build_model_from_cfg(config, only_teacher=True) + dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") + model.eval() + model.cuda() + return model + + +def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: + cudnn.benchmark = True + config = setup(args) + model = build_model_for_eval(config, args.pretrained_weights) + autocast_dtype = get_autocast_dtype(config) + return model, autocast_dtype diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py b/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b2f7e34f41ba6a0b911023e0c5375eef21f426fa --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py @@ -0,0 +1,147 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
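A hedged sketch of a typical evaluation entry point built on the helpers in dinov2/eval/setup.py above; the config file and checkpoint paths are placeholders, not files introduced by this diff.

# Hedged sketch, not part of the diff: combining get_args_parser and setup_and_build_model.
# The config and weights paths below are placeholders.
import sys

from dinov2.eval.setup import get_args_parser, setup_and_build_model


def main() -> int:
    parser = get_args_parser(description="frozen backbone example")
    args = parser.parse_args(
        [
            "--config-file", "path/to/eval_config.yaml",     # placeholder
            "--pretrained-weights", "path/to/teacher.pth",   # placeholder
            "--output-dir", "./eval_output",
        ]
    )
    # Builds the frozen teacher backbone on GPU and derives the autocast dtype from
    # config.compute_precision.teacher.backbone.mixed_precision.param_dtype.
    model, autocast_dtype = setup_and_build_model(args)
    print(type(model).__name__, autocast_dtype)
    return 0


if __name__ == "__main__":
    sys.exit(main())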
+ +import logging +from typing import Dict, Optional + +import torch +from torch import nn +from torchmetrics import MetricCollection + +from dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader +import dinov2.distributed as distributed +from dinov2.logging import MetricLogger + + +logger = logging.getLogger("dinov2") + + +class ModelWithNormalize(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, samples): + return nn.functional.normalize(self.model(samples), dim=1, p=2) + + +class ModelWithIntermediateLayers(nn.Module): + def __init__(self, feature_model, n_last_blocks, autocast_ctx): + super().__init__() + self.feature_model = feature_model + self.feature_model.eval() + self.n_last_blocks = n_last_blocks + self.autocast_ctx = autocast_ctx + + def forward(self, images): + with torch.inference_mode(): + with self.autocast_ctx(): + features = self.feature_model.get_intermediate_layers( + images, self.n_last_blocks, return_class_token=True + ) + return features + + +@torch.inference_mode() +def evaluate( + model: nn.Module, + data_loader, + postprocessors: Dict[str, nn.Module], + metrics: Dict[str, MetricCollection], + device: torch.device, + criterion: Optional[nn.Module] = None, +): + model.eval() + if criterion is not None: + criterion.eval() + + for metric in metrics.values(): + metric = metric.to(device) + + metric_logger = MetricLogger(delimiter=" ") + header = "Test:" + + for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header): + outputs = model(samples.to(device)) + targets = targets.to(device) + + if criterion is not None: + loss = criterion(outputs, targets) + metric_logger.update(loss=loss.item()) + + for k, metric in metrics.items(): + metric_inputs = postprocessors[k](outputs, targets) + metric.update(**metric_inputs) + + metric_logger.synchronize_between_processes() + logger.info(f"Averaged stats: {metric_logger}") + + stats = {k: metric.compute() for k, metric in metrics.items()} + metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + return metric_logger_stats, stats + + +def all_gather_and_flatten(tensor_rank): + tensor_all_ranks = torch.empty( + distributed.get_global_size(), + *tensor_rank.shape, + dtype=tensor_rank.dtype, + device=tensor_rank.device, + ) + tensor_list = list(tensor_all_ranks.unbind(0)) + torch.distributed.all_gather(tensor_list, tensor_rank.contiguous()) + return tensor_all_ranks.flatten(end_dim=1) + + +def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False): + dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset) + sample_count = len(dataset_with_enumerated_targets) + data_loader = make_data_loader( + dataset=dataset_with_enumerated_targets, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + ) + return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu) + + +@torch.inference_mode() +def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False): + gather_device = torch.device("cpu") if gather_on_cpu else torch.device("cuda") + metric_logger = MetricLogger(delimiter=" ") + features, all_labels = None, None + for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10): + samples = samples.cuda(non_blocking=True) + labels_rank = labels_rank.cuda(non_blocking=True) + index = index.cuda(non_blocking=True) + 
features_rank = model(samples).float() + + # init storage feature matrix + if features is None: + features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device) + labels_shape = list(labels_rank.shape) + labels_shape[0] = sample_count + all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device) + logger.info(f"Storing features into tensor of shape {features.shape}") + + # share indexes, features and labels between processes + index_all = all_gather_and_flatten(index).to(gather_device) + features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device) + labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device) + + # update storage feature matrix + if len(index_all) > 0: + features.index_copy_(0, index_all, features_all_ranks) + all_labels.index_copy_(0, index_all, labels_all_ranks) + + logger.info(f"Features shape: {tuple(features.shape)}") + logger.info(f"Labels shape: {tuple(all_labels.shape)}") + + assert torch.all(all_labels > -1) + + return features, all_labels diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71d20397611619e6a02ea07f5305d650ffef2a51 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +from typing import Any + +import torch +import dinov2.distributed as distributed +from functools import partial +from fvcore.common.checkpoint import Checkpointer +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy +from torch.distributed.fsdp import MixedPrecision +from torch.distributed.fsdp import StateDictType +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.distributed.fsdp.wrap import ModuleWrapPolicy +from torch.distributed.fsdp._runtime_utils import _reshard + + +def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()): + sharding_strategy_dict = { + "NO_SHARD": ShardingStrategy.NO_SHARD, + "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP, + "FULL_SHARD": ShardingStrategy.FULL_SHARD, + } + + dtype_dict = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + } + + mixed_precision_config = MixedPrecision( + param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype], + reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype], + buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype], + ) + + sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy] + + local_rank = distributed.get_local_rank() + + fsdp_wrapper = partial( + FSDP, + sharding_strategy=sharding_strategy_config, + mixed_precision=mixed_precision_config, + device_id=local_rank, + sync_module_states=True, + use_orig_params=True, + auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap), + ) + return fsdp_wrapper + + +def is_fsdp(x): + return isinstance(x, FSDP) + + +def is_sharded_fsdp(x): + return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD + + +def free_if_fsdp(x): + if is_sharded_fsdp(x): + handles = x._handles + true_list = [True for h in handles] + _reshard(x, handles, true_list) + + +def get_fsdp_modules(x): + return FSDP.fsdp_modules(x) + + +def 
reshard_fsdp_model(x): + for m in get_fsdp_modules(x): + free_if_fsdp(m) + + +def rankstr(): + return f"rank_{distributed.get_global_rank()}" + + +class FSDPCheckpointer(Checkpointer): + def save(self, name: str, **kwargs: Any) -> None: + """ + Dump model and checkpointables to a file. + + Args: + name (str): name of the file. + kwargs (dict): extra arbitrary data to save. + """ + if not self.save_dir or not self.save_to_disk: + return + + data = {} + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + data["model"] = self.model.state_dict() + + # data["model"] = self.model.state_dict() + for key, obj in self.checkpointables.items(): + data[key] = obj.state_dict() + data.update(kwargs) + + basename = f"{name}.{rankstr()}.pth" + save_file = os.path.join(self.save_dir, basename) + assert os.path.basename(save_file) == basename, basename + self.logger.info("Saving checkpoint to {}".format(save_file)) + with self.path_manager.open(save_file, "wb") as f: + torch.save(data, f) + self.tag_last_checkpoint(basename) + + def load(self, *args, **kwargs): + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + return super().load(*args, **kwargs) + + def has_checkpoint(self) -> bool: + """ + Returns: + bool: whether a checkpoint exists in the target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + return self.path_manager.exists(save_file) + + def get_checkpoint_file(self) -> str: + """ + Returns: + str: The latest checkpoint file in target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + try: + with self.path_manager.open(save_file, "r") as f: + last_saved = f.read().strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + return "" + # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got + # `Union[bytes, str]`. + return os.path.join(self.save_dir, last_saved) + + def tag_last_checkpoint(self, last_filename_basename: str) -> None: + """ + Tag the last checkpoint. + + Args: + last_filename_basename (str): the basename of the last filename. + """ + if distributed.is_enabled(): + torch.distributed.barrier() + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + with self.path_manager.open(save_file, "w") as f: + f.write(last_filename_basename) # pyre-ignore + + +ShardedGradScaler = ShardedGradScaler diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..31f196aacac5be8a7c537a3dfa8f97084671b466 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
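Before the layer modules, a hedged sketch of how the FSDP helpers in dinov2/fsdp/__init__.py above are typically combined: wrap the transformer blocks with get_fsdp_wrapper, save a per-rank checkpoint with FSDPCheckpointer, then reshard. It assumes torch.distributed is initialized, a CUDA device is available, and that cfg.compute_precision.student.backbone (a stand-in name here) carries the sharding and mixed-precision settings.

# Hedged sketch, not part of the diff. Assumes an initialized process group, a CUDA device,
# and that cfg.compute_precision.student.backbone provides sharding_strategy / mixed_precision.
import torch

from dinov2.fsdp import FSDPCheckpointer, get_fsdp_wrapper, reshard_fsdp_model
from dinov2.layers import NestedTensorBlock


def wrap_and_checkpoint(cfg, model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    # Shard every NestedTensorBlock according to the config (NO_SHARD / SHARD_GRAD_OP / FULL_SHARD).
    fsdp_wrap = get_fsdp_wrapper(cfg.compute_precision.student.backbone, modules_to_wrap={NestedTensorBlock})
    model = fsdp_wrap(model)

    # One checkpoint file per rank: "<name>.rank_<r>.pth", tagged in "last_checkpoint.rank_<r>".
    checkpointer = FSDPCheckpointer(model, save_dir="./output", optimizer=optimizer, save_to_disk=True)
    checkpointer.save("model_0001250", iteration=1250)

    # Free any unsharded parameters gathered during forward/backward.
    reshard_fsdp_model(model)
    return model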
+ +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1f9b0c94b40967dfdff4f261c127cbd21328c905 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, "xFormers is required for nested tensors usage" + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py new file mode 100644 index 0000000000000000000000000000000000000000..25488f57cc0ad3c692f86b62555f6668e2a66db1 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py @@ -0,0 +1,252 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
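A hedged shape-check for the attention modules in dinov2/layers/attention.py above; all sizes are illustrative.

# Hedged sketch, not part of the diff: Attention / MemEffAttention act on (B, N, C) token tensors.
import torch

from dinov2.layers.attention import Attention, MemEffAttention

tokens = torch.randn(2, 197, 768)       # 2 images, 196 patch tokens + 1 class token, dim 768

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
out = attn(tokens)
assert out.shape == tokens.shape        # self-attention preserves the (B, N, C) shape

# Without xFormers, MemEffAttention falls back to the base implementation as long as
# no attn_bias is passed.
mem_attn = MemEffAttention(dim=768, num_heads=12, qkv_bias=True)
assert mem_attn(tokens).shape == tokens.shape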
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Callable, List, Any, Tuple, Dict + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = 
torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, 
LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage" + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7212db92a4fd8d4c7230e284e551a0234e9d8623 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..af05625984dd14682cc96a63bf0c97bab1f123b1 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..ca5daa52bd81d3581adeb2198ea5b7dba2a3aea1 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4b315f972f9a9f54aef1e4ef4e81b52976f018 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
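A hedged sketch of the two small residual-branch utilities above (DropPath in drop_path.py, LayerScale in layer_scale.py); sizes and rates are illustrative.

# Hedged sketch, not part of the diff: DropPath zeroes whole samples of a residual branch
# during training, LayerScale rescales the branch by a learned per-channel gamma.
import torch

from dinov2.layers.drop_path import DropPath
from dinov2.layers.layer_scale import LayerScale

x = torch.randn(4, 16, 32)              # (batch, tokens, dim)

dp = DropPath(drop_prob=0.2)
dp.train()
_ = dp(x)                               # ~20% of samples zeroed, the rest scaled by 1/keep_prob
dp.eval()
assert torch.equal(dp(x), x)            # identity at inference time

ls = LayerScale(dim=32, init_values=1e-5)
y = ls(x)                               # x * gamma, with gamma initialised to 1e-5
assert y.shape == x.shape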
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..574abe41175568d700a389b8b96d1ba554914779 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py b/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3324b266fb0a50ccf8c3a0ede2ae10ac4dfa03e --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Callable, Optional + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e80dadb2d57056e9f6f4989cd24a3c7e26fee23f --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py @@ -0,0 +1,103 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import logging +import os +import sys +from typing import Optional + +import dinov2.distributed as distributed +from .helpers import MetricLogger, SmoothedValue + + +# So that calling _configure_logger multiple times won't add many handlers +@functools.lru_cache() +def _configure_logger( + name: Optional[str] = None, + *, + level: int = logging.DEBUG, + output: Optional[str] = None, +): + """ + Configure a logger. + + Adapted from Detectron2. + + Args: + name: The name of the logger to configure. + level: The logging level to use. + output: A file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + + Returns: + The configured logger. 
+ """ + + logger = logging.getLogger(name) + logger.setLevel(level) + logger.propagate = False + + # Loosely match Google glog format: + # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg + # but use a shorter timestamp and include the logger name: + # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg + fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " + fmt_message = "%(message)s" + fmt = fmt_prefix + fmt_message + datefmt = "%Y%m%d %H:%M:%S" + formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) + + # stdout logging for main worker only + if distributed.is_main_process(): + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # file logging for all workers + if output: + if os.path.splitext(output)[-1] in (".txt", ".log"): + filename = output + else: + filename = os.path.join(output, "logs", "log.txt") + + if not distributed.is_main_process(): + global_rank = distributed.get_global_rank() + filename = filename + ".rank{}".format(global_rank) + + os.makedirs(os.path.dirname(filename), exist_ok=True) + + handler = logging.StreamHandler(open(filename, "a")) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger + + +def setup_logging( + output: Optional[str] = None, + *, + name: Optional[str] = None, + level: int = logging.DEBUG, + capture_warnings: bool = True, +) -> None: + """ + Setup logging. + + Args: + output: A file name or a directory to save log files. If None, log + files will not be saved. If output ends with ".txt" or ".log", it + is assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + name: The name of the logger to configure, by default the root logger. + level: The logging level to use. + capture_warnings: Whether warnings should be captured as logs. + """ + logging.captureWarnings(capture_warnings) + _configure_logger(name, level=level, output=output) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py b/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..16d643500d2ee10ffea5916aad07f9b9d7c0af6d --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from collections import defaultdict, deque +import datetime +import json +import logging +import time + +import torch + +import dinov2.distributed as distributed + + +logger = logging.getLogger("dinov2") + + +class MetricLogger(object): + def __init__(self, delimiter="\t", output_file=None): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + self.output_file = output_file + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def dump_in_output_file(self, iteration, iter_time, data_time): + if self.output_file is None or not distributed.is_main_process(): + return + dict_to_dump = dict( + iteration=iteration, + iter_time=iter_time, + data_time=data_time, + ) + dict_to_dump.update({k: v.median for k, v in self.meters.items()}) + with open(self.output_file, "a") as f: + f.write(json.dumps(dict_to_dump) + "\n") + pass + + def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0): + i = start_iteration + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.6f}") + data_time = SmoothedValue(fmt="{avg:.6f}") + + if n_iterations is None: + n_iterations = len(iterable) + + space_fmt = ":" + str(len(str(n_iterations))) + "d" + + log_list = [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ] + if torch.cuda.is_available(): + log_list += ["max mem: {memory:.0f}"] + + log_msg = self.delimiter.join(log_list) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == n_iterations - 1: + self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg) + eta_seconds = iter_time.global_avg * (n_iterations - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + ) + ) + i += 1 + end = time.time() + if i >= n_iterations: + break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations)) + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, num=1): + self.deque.append(value) + self.count += num + self.total += value * num + + def synchronize_between_processes(self): + """ + Distributed synchronization of the metric + Warning: does not synchronize the deque! + """ + if not distributed.is_enabled(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") + torch.distributed.barrier() + torch.distributed.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..477b71b28259bf97b806df3f3d2f392dded866d6 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .dino_clstoken_loss import DINOLoss +from .ibot_patch_loss import iBOTPatchLoss +from .koleo_loss import KoLeoLoss diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py b/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2f33897efb1084e6c1c14ae00bc93ab332c61074 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + + +class DINOLoss(nn.Module): + def __init__( + self, + out_dim, + student_temp=0.1, + center_momentum=0.9, + ): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_output = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_output, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3): + teacher_output = teacher_output.float() + world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + B = Q.shape[1] * world_size # number of samples to assign + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_output_list, teacher_out_softmaxed_centered_list): + """ + Cross-entropy between softmax outputs of the teacher and student networks. + """ + # TODO: Use cross_entropy_distribution here + total_loss = 0 + for s in student_output_list: + lsm = F.log_softmax(s / self.student_temp, dim=-1) + for t in teacher_out_softmaxed_centered_list: + loss = torch.sum(t * lsm, dim=-1) + total_loss -= loss.mean() + return total_loss + + @torch.no_grad() + def update_center(self, teacher_output): + self.reduce_center_update(teacher_output) + + @torch.no_grad() + def reduce_center_update(self, teacher_output): + self.updated = False + self.len_teacher_output = len(teacher_output) + self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_output * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py b/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..16bc5cf634d661f1fa337304273f60dcd43c79c3 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
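A hedged, single-process sketch of the class-token DINO loss defined in dinov2/loss/dino_clstoken_loss.py above; the prototype count, batch size, and temperatures are illustrative.

# Hedged sketch, not part of the diff: DINOLoss with softmax-centering of the teacher outputs.
import torch

from dinov2.loss import DINOLoss

out_dim = 4096                                          # illustrative prototype count
dino_loss = DINOLoss(out_dim=out_dim, student_temp=0.1, center_momentum=0.9)

student_logits = [torch.randn(8, out_dim) for _ in range(2)]   # e.g. two global crops
teacher_logits = [torch.randn(8, out_dim) for _ in range(2)]

teacher_probs = [dino_loss.softmax_center_teacher(t, teacher_temp=0.07) for t in teacher_logits]
loss = dino_loss(student_logits, teacher_probs)                # scalar cross-entropy term

dino_loss.update_center(torch.cat(teacher_logits))             # EMA update of the center buffer
print(float(loss))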
+ +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + +import logging + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import cross_entropy + + def lossfunc(t, s, temp): + s = s.float() + t = t.float() + if s.ndim == 2: + return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0) + elif s.ndim == 3: + return -cross_entropy(s, t, temp, bw_inplace=True) + +except ImportError: + + def lossfunc(t, s, temp): + return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1) + + +class iBOTPatchLoss(nn.Module): + def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, 1, patch_out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_patch_tokens = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + # + # WARNING: + # as self.center is a float32, everything gets casted to float32 afterwards + # + # teacher_patch_tokens = teacher_patch_tokens.float() + # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1) + + return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1) + + # this is experimental, keep everything in float16 and let's see what happens: + # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3): + teacher_output = teacher_output.float() + # world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + # B = Q.shape[1] * world_size # number of samples to assign + B = n_masked_patches_tensor + dist.all_reduce(B) + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat): + """ + Cross-entropy between softmax outputs of the teacher and student networks. 
+ student_patch_tokens: (B, N, D) tensor + teacher_patch_tokens: (B, N, D) tensor + student_masks_flat: (B, N) tensor + """ + t = teacher_patch_tokens + s = student_patch_tokens + loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0) + return -loss.mean() + + def forward_masked( + self, + student_patch_tokens_masked, + teacher_patch_tokens_masked, + student_masks_flat, + n_masked_patches=None, + masks_weight=None, + ): + t = teacher_patch_tokens_masked + s = student_patch_tokens_masked + # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = lossfunc(t, s, self.student_temp) + if masks_weight is None: + masks_weight = ( + (1 / student_masks_flat.sum(-1).clamp(min=1.0)) + .unsqueeze(-1) + .expand_as(student_masks_flat)[student_masks_flat] + ) + if n_masked_patches is not None: + loss = loss[:n_masked_patches] + loss = loss * masks_weight + return -loss.sum() / student_masks_flat.shape[0] + + @torch.no_grad() + def update_center(self, teacher_patch_tokens): + self.reduce_center_update(teacher_patch_tokens) + + @torch.no_grad() + def reduce_center_update(self, teacher_patch_tokens): + self.updated = False + self.len_teacher_patch_tokens = len(teacher_patch_tokens) + self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py b/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e776d0426bb029cf48f25b0c94077720bc8421c4 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# import torch.distributed as dist + + +logger = logging.getLogger("dinov2") + + +class KoLeoLoss(nn.Module): + """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" + + def __init__(self): + super().__init__() + self.pdist = nn.PairwiseDistance(2, eps=1e-8) + + def pairwise_NNs_inner(self, x): + """ + Pairwise nearest neighbors for L2-normalized vectors. + Uses Torch rather than Faiss to remain on GPU. 
+ """ + # parwise dot products (= inverse distance) + dots = torch.mm(x, x.t()) + n = x.shape[0] + dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 + # max inner prod -> min distance + _, I = torch.max(dots, dim=1) # noqa: E741 + return I + + def forward(self, student_output, eps=1e-8): + """ + Args: + student_output (BxD): backbone output of student + """ + with torch.cuda.amp.autocast(enabled=False): + student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) + I = self.pairwise_NNs_inner(student_output) # noqa: E741 + distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B + loss = -torch.log(distances + eps).mean() + return loss diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5a1f3832464f898752e57e865760e9864613cb --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + + +logger = logging.getLogger("dinov2") + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix("_memeff") + if "vit" in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py b/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..18e159a986336af813c8f0e505b946f42cd83e47 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py @@ -0,0 +1,358 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = 
SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode="bicubic", + ) + + assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is 
an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_base(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_large(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py new file mode 100644 
index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..15d674b78b0629aa0f041c2426c894925469a0e8 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.knn import get_args_parser as get_knn_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.knn import main as knn_main + + self._setup_args() + knn_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 k-NN evaluation" + knn_args_parser = get_knn_args_parser(add_help=False) + parents = [knn_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:knn") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c264762ac6bb82a3622c74e1e683ea5c6be437 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
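The k-NN launcher above only wires the evaluation arguments into the shared submitit plumbing (get_args_parser and submit_jobs from dinov2.run.submit). A hypothetical invocation with placeholder paths; --nodes and --ngpus come from the shared SLURM parser below, while --config-file and --output-dir are assumed to be provided by the inherited k-NN evaluation parser:

    python -m dinov2.run.eval.knn \
        --config-file <path/to/eval_config.yaml> \
        --output-dir <path/to/output>/%j \
        --nodes 1 --ngpus 4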
+ +import logging +import os +import sys + +from dinov2.eval.linear import get_args_parser as get_linear_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.linear import main as linear_main + + self._setup_args() + linear_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 linear evaluation" + linear_args_parser = get_linear_args_parser(add_help=False) + parents = [linear_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:linear") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..9d3d5a5742792fc8d4ca3b39c15c47e8aa349bc7 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.log_regression import main as log_regression_main + + self._setup_args() + log_regression_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 logistic evaluation" + log_regression_args_parser = get_log_regression_args_parser(add_help=False) + parents = [log_regression_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
+ submit_jobs(Evaluator, args, name="dinov2:logreg") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py new file mode 100644 index 0000000000000000000000000000000000000000..68140f3d6d93dc67ccd7c45fe712eb15483d1ad6 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +from pathlib import Path +from typing import List, Optional + +import submitit + +from dinov2.utils.cluster import ( + get_slurm_executor_parameters, + get_slurm_partition, + get_user_checkpoint_path, +) + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +) -> argparse.ArgumentParser: + parents = parents or [] + slurm_partition = get_slurm_partition() + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--ngpus", + "--gpus", + "--gpus-per-node", + default=8, + type=int, + help="Number of GPUs to request on each node", + ) + parser.add_argument( + "--nodes", + "--nnodes", + default=2, + type=int, + help="Number of nodes to request", + ) + parser.add_argument( + "--timeout", + default=2800, + type=int, + help="Duration of the job", + ) + parser.add_argument( + "--partition", + default=slurm_partition, + type=str, + help="Partition where to submit", + ) + parser.add_argument( + "--use-volta32", + action="store_true", + help="Request V100-32GB GPUs", + ) + parser.add_argument( + "--comment", + default="", + type=str, + help="Comment to pass to scheduler, e.g. 
priority message", + ) + parser.add_argument( + "--exclude", + default="", + type=str, + help="Nodes to exclude", + ) + return parser + + +def get_shared_folder() -> Path: + user_checkpoint_path = get_user_checkpoint_path() + if user_checkpoint_path is None: + raise RuntimeError("Path to user checkpoint cannot be determined") + path = user_checkpoint_path / "experiments" + path.mkdir(exist_ok=True) + return path + + +def submit_jobs(task_class, args, name: str): + if not args.output_dir: + args.output_dir = str(get_shared_folder() / "%j") + + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30) + + kwargs = {} + if args.use_volta32: + kwargs["slurm_constraint"] = "volta32gb" + if args.comment: + kwargs["slurm_comment"] = args.comment + if args.exclude: + kwargs["slurm_exclude"] = args.exclude + + executor_params = get_slurm_executor_parameters( + nodes=args.nodes, + num_gpus_per_node=args.ngpus, + timeout_min=args.timeout, # max is 60 * 72 + slurm_signal_delay_s=120, + slurm_partition=args.partition, + **kwargs, + ) + executor.update_parameters(name=name, **executor_params) + + task = task_class(args) + job = executor.submit(task) + + logger.info(f"Submitted job_id: {job.job_id}") + str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id)) + logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}") diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py b/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..24716f2a314820a4cc15289fe0cb13ad52cf343c --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.logging import setup_logging +from dinov2.train import get_args_parser as get_train_args_parser +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.train import main as train_main + + self._setup_args() + train_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 training" + train_args_parser = get_train_args_parser(add_help=False) + parents = [train_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
+ submit_jobs(Trainer, args, name="dinov2:train") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b66d17aa547ed5560e75a03f5c1587da2d4fd7 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .train import get_args_parser, main +from .ssl_meta_arch import SSLMetaArch diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py b/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..86d0c2413f9abc61953d0e12b43a5a843d97d244 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py @@ -0,0 +1,403 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +import logging + +import torch +from torch import nn + +from dinov2.loss import DINOLoss, iBOTPatchLoss, KoLeoLoss +from dinov2.models import build_model_from_cfg +from dinov2.layers import DINOHead +from dinov2.utils.utils import has_batchnorms +from dinov2.utils.param_groups import get_params_groups_with_decay, fuse_params_groups +from dinov2.fsdp import get_fsdp_wrapper, ShardedGradScaler, get_fsdp_modules, reshard_fsdp_model + +from dinov2.models.vision_transformer import BlockChunk + +try: + from xformers.ops import fmha + + XFORMERS_AVAILABLE = True +except ImportError: + XFORMERS_AVAILABLE = False +assert XFORMERS_AVAILABLE, "xFormers is required for DINOv2 training" + + +logger = logging.getLogger("dinov2") + + +class SSLMetaArch(nn.Module): + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.fp16_scaler = ShardedGradScaler() if cfg.compute_precision.grad_scaler else None + + student_model_dict = dict() + teacher_model_dict = dict() + + student_backbone, teacher_backbone, embed_dim = build_model_from_cfg(cfg) + student_model_dict["backbone"] = student_backbone + teacher_model_dict["backbone"] = teacher_backbone + logger.info(f"OPTIONS -- architecture : embed_dim: {embed_dim}") + + if cfg.student.pretrained_weights: + chkpt = torch.load(cfg.student.pretrained_weights) + logger.info(f"OPTIONS -- pretrained weights: loading from {cfg.student.pretrained_weights}") + student_backbone.load_state_dict(chkpt["model"], strict=False) + + self.embed_dim = embed_dim + self.dino_out_dim = cfg.dino.head_n_prototypes + + self.do_dino = cfg.dino.loss_weight > 0 + self.do_koleo = cfg.dino.koleo_loss_weight > 0 + self.do_ibot = cfg.ibot.loss_weight > 0 + self.ibot_separate_head = cfg.ibot.separate_head + + logger.info("OPTIONS -- DINO") + if self.do_dino: + logger.info(f"OPTIONS -- DINO -- loss_weight: {cfg.dino.loss_weight}") + logger.info(f"OPTIONS -- DINO -- head_n_prototypes: {cfg.dino.head_n_prototypes}") + logger.info(f"OPTIONS -- DINO -- head_bottleneck_dim: {cfg.dino.head_bottleneck_dim}") + logger.info(f"OPTIONS -- DINO -- head_hidden_dim: {cfg.dino.head_hidden_dim}") + self.dino_loss_weight = cfg.dino.loss_weight + dino_head = partial( + 
DINOHead, + in_dim=embed_dim, + out_dim=cfg.dino.head_n_prototypes, + hidden_dim=cfg.dino.head_hidden_dim, + bottleneck_dim=cfg.dino.head_bottleneck_dim, + nlayers=cfg.dino.head_nlayers, + ) + self.dino_loss = DINOLoss(self.dino_out_dim) + if self.do_koleo: + logger.info("OPTIONS -- DINO -- applying KOLEO regularization") + self.koleo_loss = KoLeoLoss() + + else: + logger.info("OPTIONS -- DINO -- not using DINO") + + if self.do_dino or self.do_ibot: + student_model_dict["dino_head"] = dino_head() + teacher_model_dict["dino_head"] = dino_head() + + logger.info("OPTIONS -- IBOT") + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_ratio_tuple: {cfg.ibot.mask_ratio_min_max}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_sample_probability: {cfg.ibot.mask_sample_probability}") + if self.do_ibot: + self.ibot_loss_weight = cfg.ibot.loss_weight + assert max(cfg.ibot.mask_ratio_min_max) > 0, "please provide a positive mask ratio tuple for ibot" + assert cfg.ibot.mask_sample_probability > 0, "please provide a positive mask probability for ibot" + self.ibot_out_dim = cfg.ibot.head_n_prototypes if self.ibot_separate_head else cfg.dino.head_n_prototypes + self.ibot_patch_loss = iBOTPatchLoss(self.ibot_out_dim) + if self.ibot_separate_head: + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT -- head_n_prototypes: {cfg.ibot.head_n_prototypes}") + logger.info(f"OPTIONS -- IBOT -- head_bottleneck_dim: {cfg.ibot.head_bottleneck_dim}") + logger.info(f"OPTIONS -- IBOT -- head_hidden_dim: {cfg.ibot.head_hidden_dim}") + ibot_head = partial( + DINOHead, + in_dim=embed_dim, + out_dim=cfg.ibot.head_n_prototypes, + hidden_dim=cfg.ibot.head_hidden_dim, + bottleneck_dim=cfg.ibot.head_bottleneck_dim, + nlayers=cfg.ibot.head_nlayers, + ) + student_model_dict["ibot_head"] = ibot_head() + teacher_model_dict["ibot_head"] = ibot_head() + else: + logger.info("OPTIONS -- IBOT -- head shared with DINO") + + self.need_to_synchronize_fsdp_streams = True + + self.student = nn.ModuleDict(student_model_dict) + self.teacher = nn.ModuleDict(teacher_model_dict) + + # there is no backpropagation through the teacher, so no need for gradients + for p in self.teacher.parameters(): + p.requires_grad = False + logger.info(f"Student and Teacher are built: they are both {cfg.student.arch} network.") + + def forward(self, inputs): + raise NotImplementedError + + def backprop_loss(self, loss): + if self.fp16_scaler is not None: + self.fp16_scaler.scale(loss).backward() + else: + loss.backward() + + def forward_backward(self, images, teacher_temp): + n_global_crops = 2 + assert n_global_crops == 2 + n_local_crops = self.cfg.crops.local_crops_number + + global_crops = images["collated_global_crops"].cuda(non_blocking=True) + local_crops = images["collated_local_crops"].cuda(non_blocking=True) + + masks = images["collated_masks"].cuda(non_blocking=True) + mask_indices_list = images["mask_indices_list"].cuda(non_blocking=True) + n_masked_patches_tensor = images["n_masked_patches"].cuda(non_blocking=True) + n_masked_patches = mask_indices_list.shape[0] + upperbound = images["upperbound"] + masks_weight = images["masks_weight"].cuda(non_blocking=True) + + n_local_crops_loss_terms = max(n_local_crops * n_global_crops, 1) + n_global_crops_loss_terms = (n_global_crops - 1) * n_global_crops + + do_dino = self.do_dino + do_ibot = self.do_ibot + + # loss scales + ibot_loss_scale = 1.0 / n_global_crops + + # teacher output 
+ @torch.no_grad() + def get_teacher_output(): + x, n_global_crops_teacher = global_crops, n_global_crops + teacher_backbone_output_dict = self.teacher.backbone(x, is_training=True) + teacher_cls_tokens = teacher_backbone_output_dict["x_norm_clstoken"] + teacher_cls_tokens = teacher_cls_tokens.chunk(n_global_crops_teacher) + # watch out: these are chunked and cat'd in reverse so A is matched to B in the global crops dino loss + teacher_cls_tokens = torch.cat((teacher_cls_tokens[1], teacher_cls_tokens[0])) + ibot_teacher_patch_tokens = teacher_backbone_output_dict["x_norm_patchtokens"] + _dim = ibot_teacher_patch_tokens.shape[-1] + n_cls_tokens = teacher_cls_tokens.shape[0] + + if do_ibot and not self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound + n_cls_tokens, _dim) + buffer_tensor_teacher[:n_cls_tokens].copy_(teacher_cls_tokens) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[n_cls_tokens : n_cls_tokens + n_masked_patches], + ) + tokens_after_head = self.teacher.dino_head(buffer_tensor_teacher) + teacher_cls_tokens_after_head = tokens_after_head[:n_cls_tokens] + masked_teacher_patch_tokens_after_head = tokens_after_head[ + n_cls_tokens : n_cls_tokens + n_masked_patches + ] + elif do_ibot and self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound, _dim) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[:n_masked_patches], + ) + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_patch_tokens_after_head = self.teacher.ibot_head(buffer_tensor_teacher)[ + :n_masked_patches + ] + else: + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_ibot_softmaxed_centered = None + + if self.cfg.train.centering == "centering": + teacher_dino_softmaxed_centered_list = self.dino_loss.softmax_center_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + self.dino_loss.update_center(teacher_cls_tokens_after_head) + if do_ibot: + masked_teacher_patch_tokens_after_head = masked_teacher_patch_tokens_after_head.unsqueeze(0) + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.softmax_center_teacher( + masked_teacher_patch_tokens_after_head[:, :n_masked_patches], teacher_temp=teacher_temp + ) + masked_teacher_ibot_softmaxed_centered = masked_teacher_ibot_softmaxed_centered.squeeze(0) + self.ibot_patch_loss.update_center(masked_teacher_patch_tokens_after_head[:n_masked_patches]) + + elif self.cfg.train.centering == "sinkhorn_knopp": + teacher_dino_softmaxed_centered_list = self.dino_loss.sinkhorn_knopp_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + + if do_ibot: + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.sinkhorn_knopp_teacher( + masked_teacher_patch_tokens_after_head, + teacher_temp=teacher_temp, + n_masked_patches_tensor=n_masked_patches_tensor, + ) + + else: + raise NotImplementedError + + return teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered + + teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered = get_teacher_output() + reshard_fsdp_model(self.teacher) + + loss_dict = {} + + loss_accumulator = 0 # for 
backprop + student_global_backbone_output_dict, student_local_backbone_output_dict = self.student.backbone( + [global_crops, local_crops], masks=[masks, None], is_training=True + ) + + inputs_for_student_head_list = [] + + # 1a: local crops cls tokens + student_local_cls_tokens = student_local_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_local_cls_tokens.unsqueeze(0)) + + # 1b: global crops cls tokens + student_global_cls_tokens = student_global_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_global_cls_tokens.unsqueeze(0)) + + # 1c: global crops patch tokens + if do_ibot: + _dim = student_global_backbone_output_dict["x_norm_clstoken"].shape[-1] + ibot_student_patch_tokens = student_global_backbone_output_dict["x_norm_patchtokens"] + buffer_tensor_patch_tokens = ibot_student_patch_tokens.new_zeros(upperbound, _dim) + buffer_tensor_patch_tokens[:n_masked_patches].copy_( + torch.index_select(ibot_student_patch_tokens.flatten(0, 1), dim=0, index=mask_indices_list) + ) + if not self.ibot_separate_head: + inputs_for_student_head_list.append(buffer_tensor_patch_tokens.unsqueeze(0)) + else: + student_global_masked_patch_tokens_after_head = self.student.ibot_head(buffer_tensor_patch_tokens)[ + :n_masked_patches + ] + + # 2: run + _attn_bias, cat_inputs = fmha.BlockDiagonalMask.from_tensor_list(inputs_for_student_head_list) + outputs_list = _attn_bias.split(self.student.dino_head(cat_inputs)) + + # 3a: local crops cls tokens + student_local_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3b: global crops cls tokens + student_global_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3c: global crops patch tokens + if do_ibot and not self.ibot_separate_head: + student_global_masked_patch_tokens_after_head = outputs_list.pop(0).squeeze(0)[:n_masked_patches] + + if n_local_crops > 0: + dino_local_crops_loss = self.dino_loss( + student_output_list=student_local_cls_tokens_after_head.chunk(n_local_crops), + teacher_out_softmaxed_centered_list=teacher_dino_softmaxed_centered_list, + ) / (n_global_crops_loss_terms + n_local_crops_loss_terms) + + # store for display + loss_dict["dino_local_crops_loss"] = dino_local_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_local_crops_loss + + # process global crops + loss_scales = 2 # this is here since we process global crops together + + if do_dino: + # compute loss + dino_global_crops_loss = ( + self.dino_loss( + student_output_list=[student_global_cls_tokens_after_head], + teacher_out_softmaxed_centered_list=[ + teacher_dino_softmaxed_centered_list.flatten(0, 1) + ], # these were chunked and stacked in reverse so A is matched to B + ) + * loss_scales + / (n_global_crops_loss_terms + n_local_crops_loss_terms) + ) + + loss_dict["dino_global_crops_loss"] = dino_global_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_global_crops_loss + + student_cls_tokens = student_global_cls_tokens + + if self.do_koleo: + koleo_loss = self.cfg.dino.koleo_loss_weight * sum( + self.koleo_loss(p) for p in student_cls_tokens.chunk(2) + ) # we don't apply koleo loss between cls tokens of a same image + loss_accumulator += koleo_loss + loss_dict["koleo_loss"] = ( + koleo_loss / loss_scales + ) # this is to display the same losses as before but we can remove eventually + + if do_ibot: + # compute loss + ibot_patch_loss = ( + self.ibot_patch_loss.forward_masked( + student_global_masked_patch_tokens_after_head, + 
masked_teacher_ibot_softmaxed_centered, + student_masks_flat=masks, + n_masked_patches=n_masked_patches, + masks_weight=masks_weight, + ) + * loss_scales + * ibot_loss_scale + ) + + # store for display + loss_dict["ibot_loss"] = ibot_patch_loss / 2 + + # accumulate loss + loss_accumulator += self.ibot_loss_weight * ibot_patch_loss + + self.backprop_loss(loss_accumulator) + + self.fsdp_synchronize_streams() + + return loss_dict + + def fsdp_synchronize_streams(self): + if self.need_to_synchronize_fsdp_streams: + torch.cuda.synchronize() + self.student.dino_head._streams = ( + self.teacher.dino_head._streams + ) = self.student.backbone._streams = self.teacher.backbone._streams + self.need_to_synchronize_fsdp_streams = False + + def update_teacher(self, m): + student_param_list = [] + teacher_param_list = [] + with torch.no_grad(): + for k in self.student.keys(): + for ms, mt in zip(get_fsdp_modules(self.student[k]), get_fsdp_modules(self.teacher[k])): + student_param_list += ms.params + teacher_param_list += mt.params + torch._foreach_mul_(teacher_param_list, m) + torch._foreach_add_(teacher_param_list, student_param_list, alpha=1 - m) + + def train(self): + super().train() + self.teacher.eval() + + def get_maybe_fused_params_for_submodel(self, m): + params_groups = get_params_groups_with_decay( + model=m, + lr_decay_rate=self.cfg.optim.layerwise_decay, + patch_embed_lr_mult=self.cfg.optim.patch_embed_lr_mult, + ) + fused_params_groups = fuse_params_groups(params_groups) + logger.info("fusing param groups") + + for g in fused_params_groups: + g["foreach"] = True + return fused_params_groups + + def get_params_groups(self): + all_params_groups = [] + for m in self.student.values(): + all_params_groups += self.get_maybe_fused_params_for_submodel(m) + return all_params_groups + + def prepare_for_distributed_training(self): + logger.info("DISTRIBUTED FSDP -- preparing model for distributed training") + if has_batchnorms(self.student): + raise NotImplementedError + # below will synchronize all student subnetworks across gpus: + for k, v in self.student.items(): + self.teacher[k].load_state_dict(self.student[k].state_dict()) + student_model_cfg = self.cfg.compute_precision.student[k] + self.student[k] = get_fsdp_wrapper(student_model_cfg, modules_to_wrap={BlockChunk})(self.student[k]) + teacher_model_cfg = self.cfg.compute_precision.teacher[k] + self.teacher[k] = get_fsdp_wrapper(teacher_model_cfg, modules_to_wrap={BlockChunk})(self.teacher[k]) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py b/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5279b9c4317e56b5c0a9c39f7bf9bf56b04a1f8b --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py @@ -0,0 +1,319 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
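update_teacher above applies the EMA rule over flattened FSDP parameter lists with torch._foreach_* ops; an equivalent per-parameter sketch (student and teacher here are stand-in modules, not the real backbones and heads):

    import torch
    from torch import nn

    student, teacher = nn.Linear(8, 8), nn.Linear(8, 8)   # stand-ins with matching parameters
    m = 0.996                                             # momentum value from the cosine schedule
    with torch.no_grad():
        for p_s, p_t in zip(student.parameters(), teacher.parameters()):
            p_t.mul_(m).add_(p_s, alpha=1 - m)            # teacher <- m * teacher + (1 - m) * student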
+ +import argparse +import logging +import math +import os +from functools import partial + +from fvcore.common.checkpoint import PeriodicCheckpointer +import torch + +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data import collate_data_and_cast, DataAugmentationDINO, MaskingGenerator +import dinov2.distributed as distributed +from dinov2.fsdp import FSDPCheckpointer +from dinov2.logging import MetricLogger +from dinov2.utils.config import setup +from dinov2.utils.utils import CosineScheduler + +from dinov2.train.ssl_meta_arch import SSLMetaArch + + +torch.backends.cuda.matmul.allow_tf32 = True # PyTorch 1.12 sets this to False by default +logger = logging.getLogger("dinov2") + + +def get_args_parser(add_help: bool = True): + parser = argparse.ArgumentParser("DINOv2 training", add_help=add_help) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument( + "--no-resume", + action="store_true", + help="Whether to not attempt to resume from the checkpoint directory. ", + ) + parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") + parser.add_argument("--eval", type=str, default="", help="Eval type to perform") + parser.add_argument( + "opts", + help=""" +Modify config options at the end of the command. For Yacs configs, use +space-separated "PATH.KEY VALUE" pairs. +For python-based LazyConfig, use "path.key=value". + """.strip(), + default=None, + nargs=argparse.REMAINDER, + ) + parser.add_argument( + "--output-dir", + "--output_dir", + default="", + type=str, + help="Output directory to save logs and checkpoints", + ) + + return parser + + +def build_optimizer(cfg, params_groups): + return torch.optim.AdamW(params_groups, betas=(cfg.optim.adamw_beta1, cfg.optim.adamw_beta2)) + + +def build_schedulers(cfg): + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + lr = dict( + base_value=cfg.optim["lr"], + final_value=cfg.optim["min_lr"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.optim["warmup_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=0, + ) + wd = dict( + base_value=cfg.optim["weight_decay"], + final_value=cfg.optim["weight_decay_end"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + momentum = dict( + base_value=cfg.teacher["momentum_teacher"], + final_value=cfg.teacher["final_momentum_teacher"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + teacher_temp = dict( + base_value=cfg.teacher["teacher_temp"], + final_value=cfg.teacher["teacher_temp"], + total_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=cfg.teacher["warmup_teacher_temp"], + ) + + lr_schedule = CosineScheduler(**lr) + wd_schedule = CosineScheduler(**wd) + momentum_schedule = CosineScheduler(**momentum) + teacher_temp_schedule = CosineScheduler(**teacher_temp) + last_layer_lr_schedule = CosineScheduler(**lr) + + last_layer_lr_schedule.schedule[ + : cfg.optim["freeze_last_layer_epochs"] * OFFICIAL_EPOCH_LENGTH + ] = 0 # mimicking the original schedules + + logger.info("Schedulers ready.") + + return ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) + + +def apply_optim_scheduler(optimizer, lr, wd, last_layer_lr): + for param_group in optimizer.param_groups: + is_last_layer = param_group["is_last_layer"] + lr_multiplier = 
param_group["lr_multiplier"] + wd_multiplier = param_group["wd_multiplier"] + param_group["weight_decay"] = wd * wd_multiplier + param_group["lr"] = (last_layer_lr if is_last_layer else lr) * lr_multiplier + + +def do_test(cfg, model, iteration): + new_state_dict = model.teacher.state_dict() + + if distributed.is_main_process(): + iterstring = str(iteration) + eval_dir = os.path.join(cfg.train.output_dir, "eval", iterstring) + os.makedirs(eval_dir, exist_ok=True) + # save teacher checkpoint + teacher_ckp_path = os.path.join(eval_dir, "teacher_checkpoint.pth") + torch.save({"teacher": new_state_dict}, teacher_ckp_path) + + +def do_train(cfg, model, resume=False): + model.train() + inputs_dtype = torch.half + fp16_scaler = model.fp16_scaler # for mixed precision training + + # setup optimizer + + optimizer = build_optimizer(cfg, model.get_params_groups()) + ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) = build_schedulers(cfg) + + # checkpointer + checkpointer = FSDPCheckpointer(model, cfg.train.output_dir, optimizer=optimizer, save_to_disk=True) + + start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + max_iter = cfg.optim.epochs * OFFICIAL_EPOCH_LENGTH + + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, + period=3 * OFFICIAL_EPOCH_LENGTH, + max_iter=max_iter, + max_to_keep=3, + ) + + # setup data preprocessing + + img_size = cfg.crops.global_crops_size + patch_size = cfg.student.patch_size + n_tokens = (img_size // patch_size) ** 2 + mask_generator = MaskingGenerator( + input_size=(img_size // patch_size, img_size // patch_size), + max_num_patches=0.5 * img_size // patch_size * img_size // patch_size, + ) + + data_transform = DataAugmentationDINO( + cfg.crops.global_crops_scale, + cfg.crops.local_crops_scale, + cfg.crops.local_crops_number, + global_crops_size=cfg.crops.global_crops_size, + local_crops_size=cfg.crops.local_crops_size, + ) + + collate_fn = partial( + collate_data_and_cast, + mask_ratio_tuple=cfg.ibot.mask_ratio_min_max, + mask_probability=cfg.ibot.mask_sample_probability, + n_tokens=n_tokens, + mask_generator=mask_generator, + dtype=inputs_dtype, + ) + + # setup data loader + + dataset = make_dataset( + dataset_str=cfg.train.dataset_path, + transform=data_transform, + target_transform=lambda _: (), + ) + # sampler_type = SamplerType.INFINITE + sampler_type = SamplerType.SHARDED_INFINITE + data_loader = make_data_loader( + dataset=dataset, + batch_size=cfg.train.batch_size_per_gpu, + num_workers=cfg.train.num_workers, + shuffle=True, + seed=start_iter, # TODO: Fix this -- cfg.train.seed + sampler_type=sampler_type, + sampler_advance=0, # TODO(qas): fix this -- start_iter * cfg.train.batch_size_per_gpu, + drop_last=True, + collate_fn=collate_fn, + ) + + # training loop + + iteration = start_iter + + logger.info("Starting training from iteration {}".format(start_iter)) + metrics_file = os.path.join(cfg.train.output_dir, "training_metrics.json") + metric_logger = MetricLogger(delimiter=" ", output_file=metrics_file) + header = "Training" + + for data in metric_logger.log_every( + data_loader, + 10, + header, + max_iter, + start_iter, + ): + current_batch_size = data["collated_global_crops"].shape[0] / 2 + if iteration > max_iter: + return + + # apply schedules + + lr = lr_schedule[iteration] + wd = wd_schedule[iteration] + mom = momentum_schedule[iteration] + teacher_temp = 
teacher_temp_schedule[iteration] + last_layer_lr = last_layer_lr_schedule[iteration] + apply_optim_scheduler(optimizer, lr, wd, last_layer_lr) + + # compute losses + + optimizer.zero_grad(set_to_none=True) + loss_dict = model.forward_backward(data, teacher_temp=teacher_temp) + + # clip gradients + + if fp16_scaler is not None: + if cfg.optim.clip_grad: + fp16_scaler.unscale_(optimizer) + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + fp16_scaler.step(optimizer) + fp16_scaler.update() + else: + if cfg.optim.clip_grad: + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + optimizer.step() + + # perform teacher EMA update + + model.update_teacher(mom) + + # logging + + if distributed.get_global_size() > 1: + for v in loss_dict.values(): + torch.distributed.all_reduce(v) + loss_dict_reduced = {k: v.item() / distributed.get_global_size() for k, v in loss_dict.items()} + + if math.isnan(sum(loss_dict_reduced.values())): + logger.info("NaN detected") + raise AssertionError + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + metric_logger.update(lr=lr) + metric_logger.update(wd=wd) + metric_logger.update(mom=mom) + metric_logger.update(last_layer_lr=last_layer_lr) + metric_logger.update(current_batch_size=current_batch_size) + metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced) + + # checkpointing and testing + + if cfg.evaluation.eval_period_iterations > 0 and (iteration + 1) % cfg.evaluation.eval_period_iterations == 0: + do_test(cfg, model, f"training_{iteration}") + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + iteration = iteration + 1 + metric_logger.synchronize_between_processes() + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +def main(args): + cfg = setup(args) + + model = SSLMetaArch(cfg).to(torch.device("cuda")) + model.prepare_for_distributed_training() + + logger.info("Model:\n{}".format(model)) + if args.eval_only: + iteration = ( + FSDPCheckpointer(model, save_dir=cfg.train.output_dir) + .resume_or_load(cfg.MODEL.WEIGHTS, resume=not args.no_resume) + .get("iteration", -1) + + 1 + ) + return do_test(cfg, model, f"manual_{iteration}") + + do_train(cfg, model, resume=not args.no_resume) + + +if __name__ == "__main__": + args = get_args_parser(add_help=True).parse_args() + main(args) diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..8d98c05d68aa6e9dc165df3db06bd70d999b3fda --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
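build_schedulers above wraps every schedule in CosineScheduler, and do_train reads each one by indexing with the global iteration; a sketch with illustrative numbers (assuming integer indexing behaves as used in do_train):

    from dinov2.utils.utils import CosineScheduler

    OFFICIAL_EPOCH_LENGTH = 1250                      # illustrative
    lr_schedule = CosineScheduler(
        base_value=1e-3,
        final_value=1e-6,
        total_iters=100 * OFFICIAL_EPOCH_LENGTH,      # epochs * OFFICIAL_EPOCH_LENGTH
        warmup_iters=10 * OFFICIAL_EPOCH_LENGTH,
        start_warmup_value=0,
    )
    lr = lr_schedule[5000]                            # value later applied via apply_optim_scheduler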
+ +from enum import Enum +import os +from pathlib import Path +from typing import Any, Dict, Optional + + +class ClusterType(Enum): + AWS = "aws" + FAIR = "fair" + RSC = "rsc" + + +def _guess_cluster_type() -> ClusterType: + uname = os.uname() + if uname.sysname == "Linux": + if uname.release.endswith("-aws"): + # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws" + return ClusterType.AWS + elif uname.nodename.startswith("rsc"): + # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc" + return ClusterType.RSC + + return ClusterType.FAIR + + +def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]: + if cluster_type is None: + return _guess_cluster_type() + + return cluster_type + + +def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + CHECKPOINT_DIRNAMES = { + ClusterType.AWS: "checkpoints", + ClusterType.FAIR: "checkpoint", + ClusterType.RSC: "checkpoint/dino", + } + return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] + + +def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + checkpoint_path = get_checkpoint_path(cluster_type) + if checkpoint_path is None: + return None + + username = os.environ.get("USER") + assert username is not None + return checkpoint_path / username + + +def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + SLURM_PARTITIONS = { + ClusterType.AWS: "learnlab", + ClusterType.FAIR: "learnlab", + ClusterType.RSC: "learn", + } + return SLURM_PARTITIONS[cluster_type] + + +def get_slurm_executor_parameters( + nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs +) -> Dict[str, Any]: + # create default parameters + params = { + "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html + "gpus_per_node": num_gpus_per_node, + "tasks_per_node": num_gpus_per_node, # one task per GPU + "cpus_per_task": 10, + "nodes": nodes, + "slurm_partition": get_slurm_partition(cluster_type), + } + # apply cluster-specific adjustments + cluster_type = get_cluster_type(cluster_type) + if cluster_type == ClusterType.AWS: + params["cpus_per_task"] = 12 + del params["mem_gb"] + elif cluster_type == ClusterType.RSC: + params["cpus_per_task"] = 12 + # set additional parameters / apply overrides + params.update(kwargs) + return params diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c3763a8b0808ad45cbbfc1dcb00d52b00113f9ad --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
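get_slurm_executor_parameters above assembles the submitit kwargs from the detected cluster; an illustrative call with the FAIR defaults (mem_gb=0 requests all memory on the node):

    from dinov2.utils.cluster import ClusterType, get_slurm_executor_parameters

    params = get_slurm_executor_parameters(nodes=2, num_gpus_per_node=8, cluster_type=ClusterType.FAIR)
    # -> {"mem_gb": 0, "gpus_per_node": 8, "tasks_per_node": 8,
    #     "cpus_per_task": 10, "nodes": 2, "slurm_partition": "learnlab"}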
+ +import math +import logging +import os + +from omegaconf import OmegaConf + +import dinov2.distributed as distributed +from dinov2.logging import setup_logging +from dinov2.utils import utils +from dinov2.configs import dinov2_default_config + + +logger = logging.getLogger("dinov2") + + +def apply_scaling_rules_to_cfg(cfg): # to fix + if cfg.optim.scaling_rule == "sqrt_wrt_1024": + base_lr = cfg.optim.base_lr + cfg.optim.lr = base_lr + cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) + logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") + else: + raise NotImplementedError + return cfg + + +def write_config(cfg, output_dir, name="config.yaml"): + logger.info(OmegaConf.to_yaml(cfg)) + saved_cfg_path = os.path.join(output_dir, name) + with open(saved_cfg_path, "w") as f: + OmegaConf.save(config=cfg, f=f) + return saved_cfg_path + + +def get_cfg_from_args(args): + args.output_dir = os.path.abspath(args.output_dir) + args.opts += [f"train.output_dir={args.output_dir}"] + default_cfg = OmegaConf.create(dinov2_default_config) + cfg = OmegaConf.load(args.config_file) + cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) + return cfg + + +def default_setup(args): + distributed.enable(overwrite=True) + seed = getattr(args, "seed", 0) + rank = distributed.get_global_rank() + + global logger + setup_logging(output=args.output_dir, level=logging.INFO) + logger = logging.getLogger("dinov2") + + utils.fix_random_seeds(seed + rank) + logger.info("git:\n {}\n".format(utils.get_sha())) + logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg_from_args(args) + os.makedirs(args.output_dir, exist_ok=True) + default_setup(args) + apply_scaling_rules_to_cfg(cfg) + write_config(cfg, args.output_dir) + return cfg diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..cef122b25ff3533e004799a1d977f63eb213fee0 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
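A short sketch of the dtype conversion helper defined below; the concrete inputs are illustrative.

import numpy as np
import torch

from dinov2.utils.dtype import as_torch_dtype

assert as_torch_dtype("float32") is torch.float32        # str -> np.dtype -> torch.dtype
assert as_torch_dtype(np.dtype("uint8")) is torch.uint8  # numpy dtypes go through the lookup table
assert as_torch_dtype(torch.bfloat16) is torch.bfloat16  # torch dtypes are returned unchanged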
+ + +from typing import Dict, Union + +import numpy as np +import torch + + +TypeSpec = Union[str, np.dtype, torch.dtype] + + +_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { + np.dtype("bool"): torch.bool, + np.dtype("uint8"): torch.uint8, + np.dtype("int8"): torch.int8, + np.dtype("int16"): torch.int16, + np.dtype("int32"): torch.int32, + np.dtype("int64"): torch.int64, + np.dtype("float16"): torch.float16, + np.dtype("float32"): torch.float32, + np.dtype("float64"): torch.float64, + np.dtype("complex64"): torch.complex64, + np.dtype("complex128"): torch.complex128, +} + + +def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: + if isinstance(dtype, torch.dtype): + return dtype + if isinstance(dtype, str): + dtype = np.dtype(dtype) + assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" + return _NUMPY_TO_TORCH_DTYPE[dtype] diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py new file mode 100644 index 0000000000000000000000000000000000000000..d707e70cc11591858d4166410d6ed80621cd49ff --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py @@ -0,0 +1,94 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import defaultdict +import logging + + +logger = logging.getLogger("dinov2") + + +def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False): + """ + Calculate lr decay rate for different ViT blocks. + Args: + name (string): parameter name. + lr_decay_rate (float): base lr decay rate. + num_layers (int): number of ViT blocks. + Returns: + lr decay rate for the given parameter. + """ + layer_id = num_layers + 1 + if name.startswith("backbone") or force_is_backbone: + if ".pos_embed" in name or ".patch_embed" in name or ".mask_token" in name or ".cls_token" in name: + layer_id = 0 + elif force_is_backbone and ( + "pos_embed" in name or "patch_embed" in name or "mask_token" in name or "cls_token" in name + ): + layer_id = 0 + elif ".blocks." in name and ".residual." not in name: + layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 + elif chunked_blocks and "blocks." in name and "residual." not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1 + elif "blocks." in name and "residual."
not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1 + + return lr_decay_rate ** (num_layers + 1 - layer_id) + + +def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0): + chunked_blocks = False + if hasattr(model, "n_blocks"): + logger.info("chunked fsdp") + n_blocks = model.n_blocks + chunked_blocks = model.chunked_blocks + elif hasattr(model, "blocks"): + logger.info("first code branch") + n_blocks = len(model.blocks) + elif hasattr(model, "backbone"): + logger.info("second code branch") + n_blocks = len(model.backbone.blocks) + else: + logger.info("else code branch") + n_blocks = 0 + all_param_groups = [] + + for name, param in model.named_parameters(): + name = name.replace("_fsdp_wrapped_module.", "") + if not param.requires_grad: + continue + decay_rate = get_vit_lr_decay_rate( + name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks + ) + d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name} + + if "last_layer" in name: + d.update({"is_last_layer": True}) + + if name.endswith(".bias") or "norm" in name or "gamma" in name: + d.update({"wd_multiplier": 0.0}) + + if "patch_embed" in name: + d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult}) + + all_param_groups.append(d) + logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""") + + return all_param_groups + + +def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")): + fused_params_groups = defaultdict(lambda: {"params": []}) + for d in all_params_groups: + identifier = "" + for k in keys: + identifier += k + str(d[k]) + "_" + + for k in keys: + fused_params_groups[identifier][k] = d[k] + fused_params_groups[identifier]["params"].append(d["params"]) + + return fused_params_groups.values() diff --git a/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py b/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..53e63eb427f6d5396c8dc153ab07e825c72b68b4 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
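Among the helpers below, CosineScheduler precomputes a per-iteration schedule (optional freeze, linear warmup, then cosine decay); a small sketch with assumed hyperparameters:

from dinov2.utils.utils import CosineScheduler

# Assumed values: 10 warmup iterations out of 100 total.
lr_schedule = CosineScheduler(base_value=1e-3, final_value=1e-6, total_iters=100, warmup_iters=10)
print(lr_schedule[0])    # start_warmup_value (0 by default)
print(lr_schedule[10])   # end of warmup, equals base_value
print(lr_schedule[99])   # close to final_value
print(lr_schedule[500])  # indices past total_iters clamp to final_value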
+ +import logging +import os +import random +import subprocess +from urllib.parse import urlparse + +import numpy as np +import torch +from torch import nn + + +logger = logging.getLogger("dinov2") + + +def load_pretrained_weights(model, pretrained_weights, checkpoint_key): + if urlparse(pretrained_weights).scheme: # If it looks like an URL + state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu") + else: + state_dict = torch.load(pretrained_weights, map_location="cpu") + if checkpoint_key is not None and checkpoint_key in state_dict: + logger.info(f"Take key {checkpoint_key} in provided checkpoint dict") + state_dict = state_dict[checkpoint_key] + # remove `module.` prefix + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + # remove `backbone.` prefix induced by multicrop wrapper + state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} + msg = model.load_state_dict(state_dict, strict=False) + logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg)) + + +def fix_random_seeds(seed=31): + """ + Fix random seeds. + """ + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def get_sha(): + cwd = os.path.dirname(os.path.abspath(__file__)) + + def _run(command): + return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() + + sha = "N/A" + diff = "clean" + branch = "N/A" + try: + sha = _run(["git", "rev-parse", "HEAD"]) + subprocess.check_output(["git", "diff"], cwd=cwd) + diff = _run(["git", "diff-index", "HEAD"]) + diff = "has uncommitted changes" if diff else "clean" + branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) + except Exception: + pass + message = f"sha: {sha}, status: {diff}, branch: {branch}" + return message + + +class CosineScheduler(object): + def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0): + super().__init__() + self.final_value = final_value + self.total_iters = total_iters + + freeze_schedule = np.zeros((freeze_iters)) + + warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(total_iters - warmup_iters - freeze_iters) + schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) + self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule)) + + assert len(self.schedule) == self.total_iters + + def __getitem__(self, it): + if it >= self.total_iters: + return self.final_value + else: + return self.schedule[it] + + +def has_batchnorms(model): + bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) + for name, module in model.named_modules(): + if isinstance(module, bn_types): + return True + return False diff --git a/torchhub/facebookresearch_dinov2_main/hubconf.py b/torchhub/facebookresearch_dinov2_main/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b36b42cd2136182ea956d8be785cf492418163d8 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/hubconf.py @@ -0,0 +1,162 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
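The entry points below are what torch.hub resolves by name; a hedged loading sketch, pointing at the upstream facebookresearch/dinov2 repo (requires network access and downloads pretrained weights):

import torch

# Resolves to the dinov2_vits14() entry point defined below.
model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
model.eval()

# Inputs must be divisible by the 14-pixel patch size; 224 = 16 * 14.
with torch.no_grad():
    cls_embedding = model(torch.randn(1, 3, 224, 224))  # shape (1, 384)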
+ +from enum import Enum +from typing import Union + +import torch + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class Weights(Enum): + LVD142M = "LVD142M" + + +def _make_dinov2_model( + *, + arch_name: str = "vit_large", + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = "mlp", + block_chunks: int = 0, + num_register_tokens: int = 0, + interpolate_antialias: bool = False, + interpolate_offset: float = 0.1, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.LVD142M, + **kwargs, +): + import vision_transformer as vits + + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + model_base_name = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + num_register_tokens=num_register_tokens, + interpolate_antialias=interpolate_antialias, + interpolate_offset=interpolate_offset, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + if pretrained: + model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) + url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" + state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + + return model + + +def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + **kwargs, + ) + + +def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name="vit_small", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_base", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_large", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) diff --git a/torchhub/facebookresearch_dinov2_main/pyproject.toml b/torchhub/facebookresearch_dinov2_main/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..da67abd8ceabe6d427a96e5d9d4f04b25aebcd32 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/pyproject.toml @@ -0,0 +1,29 @@ +[tool.black] +line-length = 120 + +[tool.pylint.master] +persistent = false +score = false + +[tool.pylint.messages_control] +disable = "all" +enable = [ + "miscellaneous", + "similarities", +] + +[tool.pylint.similarities] +ignore-comments = true +ignore-docstrings = true +ignore-imports = true +min-similarity-lines = 8 + +[tool.pylint.reports] +reports = false + +[tool.pylint.miscellaneous] +notes = [ + "FIXME", + "XXX", + "TODO", +] diff --git a/torchhub/facebookresearch_dinov2_main/requirements-dev.txt b/torchhub/facebookresearch_dinov2_main/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cad34c34cde3a182b616d68b168588827eb9b7c --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/requirements-dev.txt @@ -0,0 +1,3 @@ +black==22.6.0 +flake8==5.0.4 +pylint==2.15.0 diff --git a/torchhub/facebookresearch_dinov2_main/requirements.txt b/torchhub/facebookresearch_dinov2_main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..04c159c443b89330ff3c84257c41b011f9791257 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/requirements.txt @@ -0,0 +1,11 @@ +--extra-index-url https://download.pytorch.org/whl/cu117 +torch==2.0.0 +torchvision==0.15.0 +omegaconf +torchmetrics==0.10.3 +fvcore +iopath +xformers==0.0.18 +submitit +--extra-index-url https://pypi.nvidia.com +cuml-cu11 diff --git a/torchhub/facebookresearch_dinov2_main/scripts/lint.sh b/torchhub/facebookresearch_dinov2_main/scripts/lint.sh new file mode 100644 index 0000000000000000000000000000000000000000..b91acaf762c4be3a0c9d2a162210bfebfaacba08 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/scripts/lint.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +if [ -n "$1" ]; then + echo "linting \"$1\"" +fi + +echo 
"running black" +if [ -n "$1" ]; then + black "$1" +else + black dinov2 +fi + +echo "running flake8" +if [ -n "$1" ]; then + flake8 "$1" +else + flake8 +fi + +echo "running pylint" +if [ -n "$1" ]; then + pylint "$1" +else + pylint dinov2 +fi + +exit 0 diff --git a/torchhub/facebookresearch_dinov2_main/setup.cfg b/torchhub/facebookresearch_dinov2_main/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..3cac0c045434cde205eebe91fd5a2c35a1226b4b --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/setup.cfg @@ -0,0 +1,7 @@ +[flake8] +max-line-length = 120 +ignore = E203,E501,W503 +per-file-ignores = + __init__.py:F401 +exclude = + venv diff --git a/torchhub/facebookresearch_dinov2_main/setup.py b/torchhub/facebookresearch_dinov2_main/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..001987cfeef6c5fe3469ea09cd4698352fa90939 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/setup.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path +import re +from typing import List, Tuple + +from setuptools import setup, find_packages + + +NAME = "dinov2" +DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." + +URL = "https://github.com/facebookresearch/dinov2" +AUTHOR = "FAIR" +REQUIRES_PYTHON = ">=3.9.0" +HERE = Path(__file__).parent + + +try: + with open(HERE / "README.md", encoding="utf-8") as f: + long_description = "\n" + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + + +def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: + requirements = [] + extra_indices = [] + with open(path) as f: + for line in f.readlines(): + line = line.rstrip("\r\n") + if line.startswith("--extra-index-url "): + extra_indices.append(line[18:]) + continue + requirements.append(line) + return requirements, extra_indices + + +def get_package_version() -> str: + with open(HERE / "dinov2/__init__.py") as f: + result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) + if result: + return result.group(1) + raise RuntimeError("Can't get package version") + + +requirements, extra_indices = get_requirements() +version = get_package_version() +dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") + + +setup( + name=NAME, + version=version, + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(), + package_data={ + "": ["*.yaml"], + }, + install_requires=requirements, + dependency_links=extra_indices, + extras_require={ + "dev": dev_requirements, + }, + install_package_data=True, + license="CC-BY-NC", + license_files=("LICENSE",), + classifiers=[ + # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: Other/Proprietary License", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", + ], +) diff --git a/torchhub/facebookresearch_dinov2_main/utils.py b/torchhub/facebookresearch_dinov2_main/utils.py new file 
mode 100644 index 0000000000000000000000000000000000000000..9c6641404093652d5a2f19b4cf283d976ec39e64 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import itertools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class CenterPadding(nn.Module): + def __init__(self, multiple): + super().__init__() + self.multiple = multiple + + def _get_pad(self, size): + new_size = math.ceil(size / self.multiple) * self.multiple + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + @torch.inference_mode() + def forward(self, x): + pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) + output = F.pad(x, pads) + return output diff --git a/torchhub/facebookresearch_dinov2_main/vision_transformer.py b/torchhub/facebookresearch_dinov2_main/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..121318f9c77a69a4467888cce44e49549e9954c0 --- /dev/null +++ b/torchhub/facebookresearch_dinov2_main/vision_transformer.py @@ -0,0 +1,395 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
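A minimal forward-pass sketch for the factory functions defined at the end of this file; the import path and input size are assumptions (the module must be importable as-is, and 224 is divisible by the default patch size of 16):

import torch

from vision_transformer import vit_small  # factory defined at the bottom of this file

model = vit_small(patch_size=16, block_chunks=0)
model.eval()
with torch.no_grad():
    cls_embedding = model(torch.randn(2, 3, 224, 224))          # shape (2, 384)
    feats = model.forward_features(torch.randn(2, 3, 224, 224))
    print(feats["x_norm_patchtokens"].shape)                     # (2, 196, 384): 14 x 14 patch tokens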
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0 + w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset + # w0, h0 = w0 + 0.1, h0 + 0.1 + + sqrt_N = math.sqrt(N) + sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2), + scale_factor=(sx, sy), + # (int(w0), int(h0)), # to solve the upsampling shape issue + mode="bicubic", + antialias=self.interpolate_antialias + ) + + assert int(w0) == patch_pos_embed.shape[-2] + assert int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + 
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model diff --git a/ui_prediction.py b/ui_prediction.py deleted file mode 100644 index 3fe8cda974245f38e5fac66e4eaf744a44ac1a41..0000000000000000000000000000000000000000 --- a/ui_prediction.py +++ /dev/null @@ -1,347 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, 
including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Zhenyu Li - -import gradio as gr -from PIL import Image -import tempfile -import torch -import numpy as np - -from zoedepth.utils.arg_utils import parse_unknown -import argparse -from zoedepth.models.builder import build_model -from zoedepth.utils.config import get_config_user -import matplotlib -import cv2 - -from infer_user import regular_tile_param, random_tile_param -from zoedepth.models.base_models.midas import Resize -from torchvision.transforms import Compose -from PIL import Image -from torchvision import transforms -import torch.nn.functional as F - -from zoedepth.models.base_models.midas import Resize -from torchvision.transforms import Compose - -import gradio as gr -import numpy as np -import trimesh -from zoedepth.utils.geometry import depth_to_points, create_triangles -from functools import partial -import tempfile - -def depth_edges_mask(depth, occ_filter_thr): - """Returns a mask of edges in the depth map. - Args: - depth: 2D numpy array of shape (H, W) with dtype float32. - Returns: - mask: 2D numpy array of shape (H, W) with dtype bool. - """ - # Compute the x and y gradients of the depth map. - depth_dx, depth_dy = np.gradient(depth) - # Compute the gradient magnitude. - depth_grad = np.sqrt(depth_dx ** 2 + depth_dy ** 2) - # Compute the edge mask. - # mask = depth_grad > 0.05 # default in zoedepth - mask = depth_grad > occ_filter_thr # preserve more edges (?) - return mask - -def load_state_dict(model, state_dict): - """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. - - DataParallel prefixes state_dict keys with 'module.' when saving. - If the model is not a DataParallel model but the state_dict is, then prefixes are removed. - If the model is a DataParallel model but the state_dict is not, then prefixes are added. - """ - state_dict = state_dict.get('model', state_dict) - # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' - - do_prefix = isinstance( - model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) - state = {} - for k, v in state_dict.items(): - if k.startswith('module.') and not do_prefix: - k = k[7:] - - if not k.startswith('module.') and do_prefix: - k = 'module.' 
+ k - - state[k] = v - - model.load_state_dict(state, strict=True) - print("Loaded successfully") - return model - -def load_wts(model, checkpoint_path): - ckpt = torch.load(checkpoint_path, map_location='cpu') - return load_state_dict(model, ckpt) - -def load_ckpt(model, checkpoint): - model = load_wts(model, checkpoint) - print("Loaded weights from {0}".format(checkpoint)) - return model - -def colorize(value, cmap='magma_r', vmin=None, vmax=None): - # normalize - vmin = value.min() if vmin is None else vmin - # vmax = value.max() if vmax is None else vmax - vmax = np.percentile(value, 95) if vmax is None else vmax - - if vmin != vmax: - value = (value - vmin) / (vmax - vmin) # vmin..vmax - else: - value = value * 0. - - cmapper = matplotlib.cm.get_cmap(cmap) - value = cmapper(value, bytes=True) # ((1)xhxwx4) - - value = value[:, :, :3] # bgr -> rgb - # rgb_value = value[..., ::-1] - rgb_value = value - - return rgb_value - -def predict_depth(model, image, mode, pn, reso, ps, device=None): - - pil_image = image - if device is not None: - image = transforms.ToTensor()(pil_image).unsqueeze(0).to(device) - else: - image = transforms.ToTensor()(pil_image).unsqueeze(0).cuda() - - image_height, image_width = image.shape[-2], image.shape[-1] - - if reso != '': - image_resolution = (int(reso.split('x')[0]), int(reso.split('x')[1])) - else: - image_resolution = (2160, 3840) - image_hr = F.interpolate(image, image_resolution, mode='bicubic', align_corners=True) - preprocess = Compose([Resize(512, 384, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal")]) - image_lr = preprocess(image) - - if ps != '': - patch_size = (int(ps.split('x')[0]), int(ps.split('x')[1])) - else: - patch_size = (int(image_resolution[0] // 4), int(image_resolution[1] // 4)) - - avg_depth_map = regular_tile_param( - model, - image_hr, - offset_x=0, - offset_y=0, - img_lr=image_lr, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - - if mode== 'P16': - pass - elif mode== 'P49': - regular_tile_param( - model, - image_hr, - offset_x=patch_size[1]//2, - offset_y=0, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - regular_tile_param( - model, - image_hr, - offset_x=0, - offset_y=patch_size[0]//2, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - regular_tile_param( - model, - image_hr, - offset_x=patch_size[1]//2, - offset_y=patch_size[0]//2, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - elif mode == 'R': - regular_tile_param( - model, - image_hr, - offset_x=patch_size[1]//2, - offset_y=0, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - regular_tile_param( - model, - image_hr, - offset_x=0, - offset_y=patch_size[0]//2, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - 
img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - regular_tile_param( - model, - image_hr, - offset_x=patch_size[1]//2, - offset_y=patch_size[0]//2, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - - for i in range(int(pn)): - random_tile_param( - model, - image_hr, - img_lr=image_lr, - iter_pred=avg_depth_map.average_map, - boundary=0, - update=True, - avg_depth_map=avg_depth_map, - crop_size=patch_size, - img_resolution=image_resolution, - transform=preprocess, - blr_mask=True) - - depth = avg_depth_map.average_map.detach().cpu() - depth = F.interpolate(depth.unsqueeze(dim=0).unsqueeze(dim=0), (image_height, image_width), mode='bicubic', align_corners=True).squeeze().numpy() - - return depth - -def create_demo(model): - gr.Markdown("## Depth Prediction Demo") - - with gr.Accordion("Advanced options", open=False): - mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='R'), - patch_number = gr.Slider(1, 1024, label="Please decide the number of random patches (Only useful in mode=R)", step=1, value=256) - resolution = gr.Textbox(label="Proccessing resolution (Default 4K. Use 'x' to split height and width.)", elem_id='mode', value='2160x3840') - patch_size = gr.Textbox(label="Patch size (Default 1/4 of image resolution. Use 'x' to split height and width.)", elem_id='mode', value='540x960') - - with gr.Row(): - input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input') - depth_image = gr.Image(label="Depth Map", elem_id='img-display-output') - raw_file = gr.File(label="16-bit raw depth, multiplier:256") - submit = gr.Button("Submit") - - def on_submit(image, mode, pn, reso, ps): - depth = predict_depth(model, image, mode, pn, reso, ps) - colored_depth = colorize(depth, cmap='gray_r') - tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) - raw_depth = Image.fromarray((depth*256).astype('uint16')) - raw_depth.save(tmp.name) - return [colored_depth, tmp.name] - - submit.click(on_submit, inputs=[input_image, mode[0], patch_number, resolution, patch_size], outputs=[depth_image, raw_file]) - examples = gr.Examples(examples=["examples/example_1.jpeg", "examples/example_2.jpeg", "examples/example_3.jpeg"], inputs=[input_image]) - -def get_mesh(model, image, mode, pn, reso, ps, keep_edges, occ_filter_thr, fov): - depth = predict_depth(model, image, mode, pn, reso, ps) - - image.thumbnail((1024,1024)) # limit the size of the input image - depth = F.interpolate(torch.from_numpy(depth).unsqueeze(dim=0).unsqueeze(dim=0), (image.height, image.width), mode='bicubic', align_corners=True).squeeze().numpy() - - pts3d = depth_to_points(depth[None], fov=float(fov)) - pts3d = pts3d.reshape(-1, 3) - - # Create a trimesh mesh from the points - # Each pixel is connected to its 4 neighbors - # colors are the RGB values of the image - - verts = pts3d.reshape(-1, 3) - image = np.array(image) - if keep_edges: - triangles = create_triangles(image.shape[0], image.shape[1]) - else: - triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth, occ_filter_thr=float(occ_filter_thr))) - colors = image.reshape(-1, 3) - mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors) - - # Save as glb - glb_file = 
tempfile.NamedTemporaryFile(suffix='.glb', delete=False) - glb_path = glb_file.name - mesh.export(glb_path) - return glb_path - -def create_demo_3d(model): - - gr.Markdown("### Image to 3D Mesh") - gr.Markdown("Convert a single 2D image to a 3D mesh") - - with gr.Accordion("Advanced options", open=False): - mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='R'), - patch_number = gr.Slider(1, 1024, label="Please decide the number of random patches (Only useful in mode=R)", step=1, value=256) - resolution = gr.Textbox(label="Proccessing resolution (Default 4K. Use 'x' to split height and width)", value='2160x3840') - patch_size = gr.Textbox(label="Patch size (Default 1/4 of image resolution. Use 'x' to split height and width)", value='540x960') - - checkbox = gr.Checkbox(label="Keep occlusion edges", value=False) - # occ_filter_thr = gr.Textbox(label="Occlusion filter threshold", info="Larger value will reserve more edges (Only useful when NOT keeping occlusion edges)", value='0.5') - # fov = gr.Textbox(label="FOV for inv-projection", value='55') - - occ_filter_thr = gr.Slider(0.01, 5, label="Occlusion edge filter threshold", info="Larger value will reserve more occlusion edges (Only useful when NOT keeping occlusion edges)", step=0.01, value=0.2) - fov = gr.Slider(5, 180, label="FOV for inv-projection", step=1, value=55) - - - with gr.Row(): - input_image = gr.Image(label="Input Image", type='pil') - result = gr.Model3D(label="3d mesh reconstruction", clear_color=[1.0, 1.0, 1.0, 1.0]) - - submit = gr.Button("Submit") - submit.click(partial(get_mesh, model), inputs=[input_image, mode[0], patch_number, resolution, patch_size, checkbox, occ_filter_thr, fov], outputs=[result]) - examples = gr.Examples(examples=["examples/example_1.jpeg", "examples/example_4.jpeg", "examples/example_3.jpeg"], inputs=[input_image]) diff --git a/ui_requirements.txt b/ui_requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f96f1390dec919edaad836b5bf7cd3b04907b360 --- /dev/null +++ b/ui_requirements.txt @@ -0,0 +1,3 @@ +gradio==4.14.0 +gradio_imageslider +trimesh==3.9.42 \ No newline at end of file diff --git a/zoedepth/data/data_mono.py b/zoedepth/data/data_mono.py index f07191ad15de554113a14332849734fede6ffd81..80a8486f239a35331df553f490e213f9bf71e735 100644 --- a/zoedepth/data/data_mono.py +++ b/zoedepth/data/data_mono.py @@ -24,8 +24,6 @@ # This file is partly inspired from BTS (https://github.com/cleinc/bts/blob/master/pytorch/bts_dataloader.py); author: Jin Han Lee -# This file may include modifications from author Zhenyu Li - import itertools import os import random @@ -51,13 +49,9 @@ from .ibims import get_ibims_loader from .sun_rgbd_loader import get_sunrgbd_loader from .vkitti import get_vkitti_loader from .vkitti2 import get_vkitti2_loader -from .u4k import get_u4k_loader -from .middleburry import get_mid_loader -from .gta import get_gta_loader + from .preprocess import CropParams, get_white_border, get_black_border -import copy -from zoedepth.utils.misc import get_boundaries -from zoedepth.models.base_models.midas import Resize + def _is_pil_image(img): return isinstance(img, Image.Image) @@ -67,18 +61,12 @@ def _is_numpy_image(img): return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) -# def preprocessing_transforms(mode, **kwargs): -# return transforms.Compose([ -# ToTensor(mode=mode, **kwargs) -# ]) - -def 
preprocessing_transforms(mode, sec_stage=False, **kwargs): +def preprocessing_transforms(mode, **kwargs): return transforms.Compose([ - ToTensor(mode=mode, sec_stage=sec_stage, **kwargs) + ToTensor(mode=mode, **kwargs) ]) - class DepthDataLoader(object): def __init__(self, config, mode, device='cpu', transform=None, **kwargs): """ @@ -136,26 +124,13 @@ class DepthDataLoader(object): self.data = get_ddad_loader(config.ddad_root, resize_shape=( 352, 1216), batch_size=1, num_workers=1) return - - # under construction - if config.dataset == 'u4k': - self.data = get_u4k_loader(config, mode, transform) - return - - if config.dataset == 'mid': - self.data = get_mid_loader(config, mode, transform) - return - - if config.dataset == 'gta': - self.data = get_gta_loader(config, mode, transform) - return - + img_size = self.config.get("img_size", None) img_size = img_size if self.config.get( "do_input_resize", False) else None + if transform is None: - # transform = preprocessing_transforms(mode, size=img_size) - transform = preprocessing_transforms(mode, size=img_size, sec_stage=config.get("sec_stage", False)) + transform = preprocessing_transforms(mode, size=img_size) if mode == 'train': @@ -187,7 +162,7 @@ class DepthDataLoader(object): else: self.eval_sampler = None self.data = DataLoader(self.testing_samples, 1, - shuffle=False, + shuffle=kwargs.get("shuffle_test", False), num_workers=1, pin_memory=False, sampler=self.eval_sampler) @@ -302,16 +277,6 @@ class DataLoadPreprocess(Dataset): with open(config.filenames_file, 'r') as f: self.filenames = f.readlines() - self.sec_stage = self.config.get("sec_stage", False) - # self.crop_size = [120, 160] # 1/4 - self.crop_size = [120*2, 160*2] # 1/4 - self.overlap = self.config.get("overlap", False) - - self.consistency_training = self.config.get("consistency_training", False) - self.overlap_length_h = self.config.get("overlap_length_h", int(60)) - self.overlap_length_w = self.config.get("overlap_length_w", int(80)) - print("current overlap_length_h and overlap_length_w are {} and {}".format(self.overlap_length_h, self.overlap_length_w)) - self.mode = mode self.transform = transform self.to_tensor = ToTensor(mode) @@ -324,39 +289,11 @@ class DataLoadPreprocess(Dataset): def postprocess(self, sample): return sample - def get_crop_bbox(self, img): - """Randomly get a crop bounding box.""" - margin_h = max(img.shape[0] - self.crop_size[0], 0) - margin_w = max(img.shape[1] - self.crop_size[1], 0) - offset_h = np.random.randint(0, margin_h + 1) - offset_w = np.random.randint(0, margin_w + 1) - crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] - crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] - - return crop_y1, crop_y2, crop_x1, crop_x2 - - def crop(self, img, crop_bbox, tmp=False): - """Crop from ``img``""" - - crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox - if tmp: - templete = np.zeros((img.shape[0], img.shape[1], 1), dtype=np.float32) - templete[crop_y1:crop_y2, crop_x1:crop_x2, :] = 1.0 - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - return img, templete - - else: - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - return img - def __getitem__(self, idx): sample_path = self.filenames[idx] focal = float(sample_path.split()[2]) sample = {} - height=480 - width=640 - if self.mode == 'train': if self.config.dataset == 'kitti' and self.config.use_right and random.random() > 0.5: image_path = os.path.join( @@ -404,7 +341,6 @@ class DataLoadPreprocess(Dataset): if self.config.do_random_rotate and (self.config.aug): - # NOTE: YES! 
random_angle = (random.random() - 0.5) * 2 * self.config.degree image = self.rotate_image(image, random_angle) depth_gt = self.rotate_image( @@ -413,91 +349,25 @@ class DataLoadPreprocess(Dataset): image = np.asarray(image, dtype=np.float32) / 255.0 depth_gt = np.asarray(depth_gt, dtype=np.float32) depth_gt = np.expand_dims(depth_gt, axis=2) - disp_gt_copy = depth_gt[:, :, 0].copy() if self.config.dataset == 'nyu': depth_gt = depth_gt / 1000.0 else: depth_gt = depth_gt / 256.0 - # if self.config.aug and (self.config.random_crop): - # image, depth_gt = self.random_crop( - # image, depth_gt, self.config.input_height, self.config.input_width) + if self.config.aug and (self.config.random_crop): + image, depth_gt = self.random_crop( + image, depth_gt, self.config.input_height, self.config.input_width) - image, depth_gt = self.train_preprocess(image, depth_gt) - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - if self.random_crop: # use in sec_stage - if self.consistency_training: - crop_y1, crop_y2, crop_x1, crop_x2 = self.get_crop_bbox(image) # ensure the prob of crop is the same - while True: - # shift_x = random.randint(self.overlap_length//3, self.overlap_length) - # shift_y = random.randint(self.overlap_length//3, self.overlap_length) - shift_x = self.overlap_length_w - shift_y = self.overlap_length_h - if random.random() > 0.5: - shift_x = shift_x * -1 - if random.random() > 0.5: - shift_y = shift_y * -1 - crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift = crop_y1 + shift_y, crop_y2 + shift_y, crop_x1 + shift_x, crop_x2 + shift_x - if crop_y1_shift > 0 and crop_x1_shift > 0 and crop_y2_shift < image.shape[0] and crop_x2_shift < image.shape[1]: - break - bbox_ori = (crop_y1, crop_y2, crop_x1, crop_x2) - bbox_shift = (crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift) - image_ori, crop_area_ori = self.crop(image, bbox_ori, tmp=True) - image_shift, crop_area_shift = self.crop(image, bbox_shift, tmp=True) - depth_gt_ori = self.crop(depth_gt, bbox_ori) - depth_gt_shift = self.crop(depth_gt, bbox_shift) - disp_gt_copy_ori = self.crop(disp_gt_copy, bbox_ori) - disp_gt_copy_shift = self.crop(disp_gt_copy, bbox_shift) - - bboxs_ori = torch.tensor([crop_x1 / width * 160 * 2, crop_y1 / height * 120 * 2, crop_x2 / width * 160 * 2, crop_y2 / height * 120 * 2]) - bboxs_shift = torch.tensor([crop_x1_shift / width * 160 * 2, crop_y1_shift / height * 120 * 2, crop_x2_shift / width * 160 * 2, crop_y2_shift / height * 120 * 2]) - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_shift = torch.tensor([crop_x1_shift, crop_y1_shift, crop_x2_shift, crop_y2_shift]) - - else: - bbox = self.get_crop_bbox(image) - image, crop_area = self.crop(image, bbox, tmp=True) - depth_gt = self.crop(depth_gt, bbox) - disp_gt_copy = self.crop(disp_gt_copy, bbox) - - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bboxs_res = torch.tensor([crop_x1 / width * 160 * 2, crop_y1 / height * 120 * 2, crop_x2 / width * 160 * 2, crop_y2 / height * 120 * 2]) # coord in 384, 512 - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) + if self.config.aug and self.config.random_translate: + # print("Random Translation!") + image, depth_gt = self.random_translate(image, depth_gt, self.config.max_translation) + image, depth_gt = self.train_preprocess(image, depth_gt) mask = np.logical_and(depth_gt > self.config.min_depth, depth_gt < self.config.max_depth).squeeze()[None, ...] 
- mask_raw = np.logical_and(depth_gt_temp > self.config.min_depth, depth_gt_temp < self.config.max_depth).squeeze()[None, ...] - sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'mask': mask, 'image_raw': image.copy(), 'mask_raw': mask_raw} - - if self.random_crop: - if self.consistency_training: - image = np.concatenate([image_ori, image_shift], axis=-1) - depth_gt = np.concatenate([depth_gt_ori, depth_gt_shift], axis=-1) - crop_area = np.concatenate([crop_area_ori, crop_area_shift], axis=-1) - bboxs_res = torch.cat([bboxs_ori, bboxs_shift], dim=-1) - bboxes_raw_res = torch.cat([bboxs_raw, bboxs_raw_shift], dim=-1) - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth) - - # hack the sample dict - sample['image'] = image - sample['depth'] = depth_gt - sample['crop_area'] = crop_area - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxes_raw_res - sample['shift'] = torch.tensor([shift_y, shift_x]) # h direction, then w direction - sample['mask'] = mask - - else: - if bboxs_res is not None: - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxs_raw - sample['crop_area'] = crop_area - - if self.config.aug and self.config.random_translate: - image, depth_gt = self.random_translate(image, depth_gt, self.config.max_translation) + sample = {'image': image, 'depth': depth_gt, 'focal': focal, + 'mask': mask, **sample} else: if self.mode == 'online_eval': @@ -525,7 +395,6 @@ class DataLoadPreprocess(Dataset): if has_valid_depth: depth_gt = np.asarray(depth_gt, dtype=np.float32) depth_gt = np.expand_dims(depth_gt, axis=2) - disp_gt_copy = depth_gt[:, :, 0].copy() if self.config.dataset == 'nyu': depth_gt = depth_gt / 1000.0 else: @@ -547,46 +416,10 @@ class DataLoadPreprocess(Dataset): depth_gt = depth_gt[top_margin:top_margin + 352, left_margin:left_margin + 1216, :] - # NOTE: start insert something new for sec_stage training - if self.sec_stage: - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - x_start, y_start = [0, 240], [0, 320] - # x_start, y_start = [0 + 3 * self.overlap / 2, 120 + self.overlap / 2, 240 - self.overlap / 2, 360 - 3 * self.overlap / 2], \ - # [0 + 3 * self.overlap / 2, 160 + self.overlap / 2, 320 - self.overlap / 2, 480 - 3 * self.overlap / 2] - img_crops = [] - bboxs_roi = [] - crop_areas = [] - bboxs_raw_list = [] - for x in x_start: - for y in y_start: - bbox = (int(x), int(x+240), int(y), int(y+320)) - img_crop, crop_area = self.crop(image, bbox, tmp=True) - img_crops.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / width * 160 * 2, crop_y1 / height * 120 * 2, crop_x2 / width * 160 * 2, crop_y2 / height * 120 * 2]) - bboxs_roi.append(bbox_roi) - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_list.append(bboxs_raw) - - image = img_crops - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw_list, dim=0) - disp_gt_edges = get_boundaries(disp_gt_copy, th=1, dilation=0) - - if self.mode == 'online_eval': sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': has_valid_depth, 'image_path': sample_path.split()[0], 'depth_path': sample_path.split()[1], - 'mask': mask, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - sample['bbox_raw'] = bboxs_raw - if crop_areas is not None: - sample['crop_area'] = crop_areas - + 'mask': mask} else: sample = {'image': image, 'focal': 
focal} @@ -596,9 +429,6 @@ class DataLoadPreprocess(Dataset): sample['mask'] = mask if self.transform: - # sample = self.transform(sample) - sample['img_temp'] = img_temp - sample['depth_gt_temp'] = depth_gt_temp sample = self.transform(sample) sample = self.postprocess(sample) @@ -680,177 +510,64 @@ class DataLoadPreprocess(Dataset): return len(self.filenames) -# class ToTensor(object): -# def __init__(self, mode, do_normalize=False, size=None): -# self.mode = mode -# self.normalize = transforms.Normalize( -# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity() -# self.size = size -# if size is not None: -# self.resize = transforms.Resize(size=size) -# else: -# self.resize = nn.Identity() - -# def __call__(self, sample): -# image, focal = sample['image'], sample['focal'] -# image = self.to_tensor(image) -# image = self.normalize(image) -# image = self.resize(image) - -# if self.mode == 'test': -# return {'image': image, 'focal': focal} - -# depth = sample['depth'] -# if self.mode == 'train': -# depth = self.to_tensor(depth) -# return {**sample, 'image': image, 'depth': depth, 'focal': focal} -# else: -# has_valid_depth = sample['has_valid_depth'] -# image = self.resize(image) -# return {**sample, 'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth, -# 'image_path': sample['image_path'], 'depth_path': sample['depth_path']} - -# def to_tensor(self, pic): -# if not (_is_pil_image(pic) or _is_numpy_image(pic)): -# raise TypeError( -# 'pic should be PIL Image or ndarray. Got {}'.format(type(pic))) - -# if isinstance(pic, np.ndarray): -# img = torch.from_numpy(pic.transpose((2, 0, 1))) -# return img - -# # handle PIL Image -# if pic.mode == 'I': -# img = torch.from_numpy(np.array(pic, np.int32, copy=False)) -# elif pic.mode == 'I;16': -# img = torch.from_numpy(np.array(pic, np.int16, copy=False)) -# else: -# img = torch.ByteTensor( -# torch.ByteStorage.from_buffer(pic.tobytes())) -# # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK -# if pic.mode == 'YCbCr': -# nchannel = 3 -# elif pic.mode == 'I;16': -# nchannel = 1 -# else: -# nchannel = len(pic.mode) -# img = img.view(pic.size[1], pic.size[0], nchannel) - -# img = img.transpose(0, 1).transpose(0, 2).contiguous() -# if isinstance(img, torch.ByteTensor): -# return img.float() -# else: -# return img - - class ToTensor(object): - def __init__(self, mode, do_normalize=False, size=None, sec_stage=False): + def __init__(self, mode, do_normalize=False, size=None): self.mode = mode - # don't do normalization as default self.normalize = transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity() self.size = size if size is not None: - # self.resize = transforms.Resize(size=size) - net_h, net_w = size - self.resize = Resize(net_w, net_h, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal") + self.resize = transforms.Resize(size=size) else: self.resize = nn.Identity() - self.sec_stage = sec_stage def __call__(self, sample): image, focal = sample['image'], sample['focal'] - crop_areas = sample.get('crop_area', None) - - if isinstance(image, list): - # there must be crop_areas - # only infer on eval sec_stage - imgs_process = [] - crp_process = [] - for img, crp in zip(image, crop_areas): - img = self.to_tensor(img) - img = self.normalize(img) - img = img.unsqueeze(dim=0) - img = self.resize(img) - img = img.squeeze(dim=0) - imgs_process.append(img) - - crp = self.to_tensor(crp) - crp = crp.unsqueeze(dim=0) - 
crp = self.resize(crp) - crp = crp.squeeze(dim=0) - crp_process.append(crp) - - image = torch.cat(imgs_process, dim=0) - crop_areas = torch.cat(crp_process, dim=0) - - img_temp = sample['img_temp'] - img_temp = self.to_tensor(img_temp) - img_temp = self.normalize(img_temp) - img_temp = img_temp.unsqueeze(dim=0) - img_temp = self.resize(img_temp) #NOTE: hack - img_temp = img_temp.squeeze(dim=0) - image_raw = copy.deepcopy(img_temp) - - - else: - image = self.to_tensor(image) - image = self.normalize(image) - - if crop_areas is not None: - crop_areas = self.to_tensor(crop_areas) - crop_areas = crop_areas.unsqueeze(dim=0) - crop_areas = self.resize(crop_areas) - crop_areas = crop_areas.squeeze(dim=0) - - if self.sec_stage: - img_temp = sample['img_temp'] - img_temp = self.to_tensor(img_temp) - img_temp = self.normalize(img_temp) - - img_temp = img_temp.unsqueeze(dim=0) - img_temp = self.resize(img_temp) - image_raw = img_temp.squeeze(dim=0) - - image = image.unsqueeze(dim=0) - image = self.resize(image) - image = image.squeeze(dim=0) - - else: - # in the first stage, this hr info is reserved - image_raw = copy.deepcopy(image) - image = image.unsqueeze(dim=0) - image = self.resize(image) - image = image.squeeze(dim=0) + image = self.to_tensor(image) + image = self.normalize(image) + image = self.resize(image) if self.mode == 'test': - return_dict = {'image': image, 'focal': focal} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict - + return {'image': image, 'focal': focal} depth = sample['depth'] - depth = self.to_tensor(depth) - depth_gt_temp = sample['depth_gt_temp'] - depth_gt_raw = self.to_tensor(depth_gt_temp) - if self.mode == 'train': - return_dict = {**sample, 'image': image, 'depth': depth, 'focal': focal, 'image_raw': image_raw, 'depth_raw': depth_gt_raw} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict + depth = self.to_tensor(depth) + return {**sample, 'image': image, 'depth': depth, 'focal': focal} else: has_valid_depth = sample['has_valid_depth'] - # image = self.resize(image) - return_dict = {**sample, 'image': image, 'depth': depth, 'focal': focal, 'image_raw': image_raw, - 'has_valid_depth': has_valid_depth, 'image_path': sample['image_path'], 'depth_path': sample['depth_path'], - 'depth_raw': depth_gt_raw} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict + image = self.resize(image) + return {**sample, 'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth, + 'image_path': sample['image_path'], 'depth_path': sample['depth_path']} def to_tensor(self, pic): + if not (_is_pil_image(pic) or _is_numpy_image(pic)): + raise TypeError( + 'pic should be PIL Image or ndarray. 
Got {}'.format(type(pic))) + if isinstance(pic, np.ndarray): - img = torch.from_numpy(pic.transpose((2, 0, 1))) # img here + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: return img diff --git a/zoedepth/data/gta.py b/zoedepth/data/gta.py deleted file mode 100644 index 8f7eb133e158546a01eeb188c8ce2fc77984a915..0000000000000000000000000000000000000000 --- a/zoedepth/data/gta.py +++ /dev/null @@ -1,320 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Zhenyu Li - -# This file is partly inspired from ZoeDepth (https://github.com/isl-org/ZoeDepth/blob/main/zoedepth/data/data_mono.py); author: Shariq Farooq Bhat - -import os -import numpy as np -import torch -from PIL import Image -from torch.utils.data import DataLoader, Dataset -from torchvision import transforms -import os.path as osp -import random -import torch.nn as nn -import cv2 -import copy -from zoedepth.utils.misc import get_boundaries -from zoedepth.models.base_models.midas import Resize - - -from .u4k import U4KDataset, remove_leading_slash - -import re -import numpy as np -import sys -import matplotlib.pyplot as plt -import imageio - -class GTA(U4KDataset): - def __init__(self, config, mode, data_root, split): - super().__init__(config, mode, data_root, split) - self.crop_size = [270, 480] # 1/4 - - def load_data_list(self): - # os.environ["OPENCV_IO_ENABLE_OPENEXR"]="1" - - """Load annotation from directory. - Args: - data_root (str): Data root for img_dir/ann_dir. - split (str|None): Split txt file. If split is specified, only file - with suffix in the splits will be loaded. Otherwise, all images - in img_dir/ann_dir will be loaded. Default: None - Returns: - list[dict]: All image info of dataset. 
- """ - data_root = self.data_root - split = self.split - - self.invalid_depth_num = 0 - img_infos = [] - if split is not None: - with open(split) as f: - for line in f: - img_info_l = dict() - - img_l, depth_map_l = line.strip().split(" ") - - img_info_l['depth_map_path'] = osp.join(data_root, remove_leading_slash(depth_map_l)) - img_info_l['img_path'] = osp.join(data_root, remove_leading_slash(img_l)) - img_info_l['depth_fields'] = [] - - img_infos.append(img_info_l) - else: - raise NotImplementedError - - # github issue:: make sure the same order - img_infos = sorted(img_infos, key=lambda x: x['img_path']) - return img_infos - - - def __getitem__(self, idx): - - img_file_path = self.data_infos[idx]['img_path'] - disp_path = self.data_infos[idx]['depth_map_path'] - - height=1080 - width=1920 - - image = Image.open(img_file_path).convert("RGB") - image = np.asarray(image, dtype=np.uint8) / 255.0 - image = image.astype(np.float32) - - # disp_gt, scale = readPFM(disp_path) - - # ref_depth_full = cv2.imread(disp_path, cv2.IMREAD_UNCHANGED) - ref_depth_full = imageio.imread(disp_path) - ref_depth_full = np.array(ref_depth_full).astype(np.float32) / 256 - # invalid_mask_full = np.logical_or(ref_depth_full < self.config.min_depth, ref_depth_full > self.config.max_depth) - # ref_depth_full[invalid_mask_full] = 0 - # ref_depth_full[ref_depth_full < self.config.min_depth] = self.config.min_depth - # ref_depth_full[ref_depth_full > self.config.max_depth] = self.config.max_depth - - depth_gt = ref_depth_full - disp_gt_copy = depth_gt - depth_gt = depth_gt[..., np.newaxis] - depth_gt[depth_gt > self.config.max_depth] = self.config.max_depth # for vis - - # disp_gt_edges = get_boundaries(disp_gt_copy, th=1/20, dilation=0) - - bbox = None - bboxs_res = None - crop_areas = None - bboxs_roi = None # hack for infer - - if self.mode == 'train': - image, depth_gt = self.train_preprocess(image, depth_gt) - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - if self.random_crop: # use in sec_stage - if self.consistency_training: - crop_y1, crop_y2, crop_x1, crop_x2 = self.get_crop_bbox(image) # ensure the prob of crop is the same - while True: - # shift_x = random.randint(self.overlap_length//3, self.overlap_length) - # shift_y = random.randint(self.overlap_length//3, self.overlap_length) - shift_x = self.overlap_length_w - shift_y = self.overlap_length_h - if random.random() > 0.5: - shift_x = shift_x * -1 - if random.random() > 0.5: - shift_y = shift_y * -1 - crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift = crop_y1 + shift_y, crop_y2 + shift_y, crop_x1 + shift_x, crop_x2 + shift_x - if crop_y1_shift > 0 and crop_x1_shift > 0 and crop_y2_shift < image.shape[0] and crop_x2_shift < image.shape[1]: - break - bbox_ori = (crop_y1, crop_y2, crop_x1, crop_x2) - bbox_shift = (crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift) - image_ori, crop_area_ori = self.crop(image, bbox_ori, tmp=True) - image_shift, crop_area_shift = self.crop(image, bbox_shift, tmp=True) - depth_gt_ori = self.crop(depth_gt, bbox_ori) - depth_gt_shift = self.crop(depth_gt, bbox_shift) - disp_gt_copy_ori = self.crop(disp_gt_copy, bbox_ori) - disp_gt_copy_shift = self.crop(disp_gt_copy, bbox_shift) - - bboxs_ori = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_shift = torch.tensor([crop_x1_shift / width * 512, crop_y1_shift / height * 384, crop_x2_shift / width * 512, crop_y2_shift / height * 384]) - bboxs_raw = 
torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_shift = torch.tensor([crop_x1_shift, crop_y1_shift, crop_x2_shift, crop_y2_shift]) - - else: - bbox = self.get_crop_bbox(image) - image, crop_area = self.crop(image, bbox, tmp=True) - depth_gt = self.crop(depth_gt, bbox) - disp_gt_copy = self.crop(disp_gt_copy, bbox) - - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bboxs_res = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) # coord in 384, 512 - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] - mask_raw = np.logical_and(depth_gt_temp > self.config.min_depth, depth_gt_temp < self.config.max_depth).squeeze()[None, ...] - sample = {'image': image, 'depth': depth_gt, 'focal': 0, 'mask': mask, 'image_raw': image.copy(), 'mask_raw': mask_raw} - - - if self.random_crop: - if self.consistency_training: - image = np.concatenate([image_ori, image_shift], axis=-1) - depth_gt = np.concatenate([depth_gt_ori, depth_gt_shift], axis=-1) - crop_area = np.concatenate([crop_area_ori, crop_area_shift], axis=-1) - bboxs_res = torch.cat([bboxs_ori, bboxs_shift], dim=-1) - bboxes_raw_res = torch.cat([bboxs_raw, bboxs_raw_shift], dim=-1) - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth) - - # hack the sample dict - sample['image'] = image - sample['depth'] = depth_gt - sample['crop_area'] = crop_area - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxes_raw_res - sample['shift'] = torch.tensor([shift_y, shift_x]) # h direction, then w direction - sample['mask'] = mask - - else: - if bboxs_res is not None: - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxs_raw - sample['crop_area'] = crop_area - - if self.sampled_training: - self.data_sampler(sample, disp_gt_copy) - - # update mask - sample_points = sample['sample_points'] - sample_mask = np.logical_and(sample_points[:, -1] > self.config.min_depth, - sample_points[:, -1] < self.config.max_depth).squeeze()[None, ...] - sample['sample_mask'] = sample_mask - - - else: - # nothing needs to be changed for consistency training. 
- - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - if self.sec_stage: - # x_start, y_start = [0, 540, 1080, 1620], [0, 960, 1920, 2880] - x_start, y_start = [0 + 3 * self.overlap / 2, 270 + self.overlap / 2, 540 - self.overlap / 2, 810 - 3 * self.overlap / 2], \ - [0 + 3 * self.overlap / 2, 480 + self.overlap / 2, 960 - self.overlap / 2, 1440 - 3 * self.overlap / 2] - img_crops = [] - bboxs_roi = [] - crop_areas = [] - bboxs_raw_list = [] - for x in x_start: - for y in y_start: - bbox = (int(x), int(x+270), int(y), int(y+480)) - img_crop, crop_area = self.crop(image, bbox, tmp=True) - img_crops.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_roi.append(bbox_roi) - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_list.append(bboxs_raw) - - image = img_crops - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw_list, dim=0) - - # bbox = (820, 1360 ,1440, 2400) # a hack version for quick evaluation - # image = self.crop(image, bbox) - # depth_gt = self.crop(depth_gt, bbox) - # disp_gt_copy = self.crop(disp_gt_copy, bbox) - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] - - disp_gt_edges = get_boundaries(disp_gt_copy, th=1, dilation=0) - if self.mode == 'online_eval': - sample = {'image': image, 'depth': depth_gt, 'focal': 0, 'has_valid_depth': True, - 'image_path': img_file_path, 'depth_path': disp_path, 'depth_factor_path': 0, - 'mask': mask, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - sample['bbox_raw'] = bboxs_raw - if crop_areas is not None: - sample['crop_area'] = crop_areas - - - else: - sample = {'image': image, 'focal': 0, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges, 'image_path': img_file_path} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - sample['bbox_raw'] = bboxs_raw - if crop_areas is not None: - sample['crop_area'] = crop_areas - - if self.transform: - sample['img_temp'] = img_temp - sample['depth_gt_temp'] = depth_gt_temp - sample = self.transform(sample) - - sample['dataset'] = self.config.dataset - return sample - - def __len__(self): - return len(self.data_infos) - -def get_gta_loader(config, mode, transform): - if mode == 'train': - log = 0 - dataset = GTA(config, mode, config.data_path, config.filenames_train) - dataset[0] - if config.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - else: - train_sampler = None - - dataloader = DataLoader(dataset, - batch_size=config.batch_size, - shuffle=(train_sampler is None), - num_workers=config.workers, - pin_memory=True, - persistent_workers=True, - sampler=train_sampler) - - elif mode == 'online_eval': - dataset = GTA(config, mode, config.data_path, config.filenames_val) - # dataset = U4KDataset(config, mode, config.data_path, config.filenames_train) - - - if config.distributed: # redundant. 
here only for readability and to be more explicit - # Give whole test set to all processes (and report evaluation only on one) regardless - eval_sampler = None - else: - eval_sampler = None - - dataloader = DataLoader(dataset, 1, - shuffle=False, - num_workers=1, - pin_memory=False, - sampler=eval_sampler) - - else: - dataset = GTA(config, mode, config.data_path, config.filenames_test) - dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=1) - - return dataloader - diff --git a/zoedepth/data/middleburry.py b/zoedepth/data/middleburry.py deleted file mode 100644 index cf3b82fabb09d986d9901dbd456c8f6cb671ffda..0000000000000000000000000000000000000000 --- a/zoedepth/data/middleburry.py +++ /dev/null @@ -1,373 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
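For reference, the gta.py loader removed above decodes its ground truth from 16-bit PNGs with imageio, divides by 256 (the KITTI-style encoding), and clamps to the configured maximum depth. A self-contained sketch of that read path; max_depth here is a placeholder for config.max_depth:

import numpy as np
import imageio

def read_gta_depth(png_path, max_depth=80.0):
    # Metric depth is stored as raw_value / 256 in a 16-bit PNG.
    raw = np.asarray(imageio.imread(png_path)).astype(np.float32)
    depth = raw / 256.0
    depth[depth > max_depth] = max_depth   # clamp far values, as the deleted loader does for visualisation
    return depth[..., np.newaxis]          # H x W x 1, matching the loader's layout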
- -# File author: Zhenyu Li - -# This file is partly inspired from ZoeDepth (https://github.com/isl-org/ZoeDepth/blob/main/zoedepth/data/data_mono.py); author: Shariq Farooq Bhat - -import os -import numpy as np -import torch -from PIL import Image -from torch.utils.data import DataLoader, Dataset -from torchvision import transforms -import os.path as osp -import random -import torch.nn as nn -import cv2 -import copy -from zoedepth.utils.misc import get_boundaries -from zoedepth.models.base_models.midas import Resize - - -from .u4k import U4KDataset, remove_leading_slash - -import re -import numpy as np -import sys -import matplotlib.pyplot as plt - -def readPFM(file): - file = open(file, 'rb') - - color = None - width = None - height = None - scale = None - endian = None - - header = file.readline().rstrip() - if (sys.version[0]) == '3': - header = header.decode('utf-8') - if header == 'PF': - color = True - elif header == 'Pf': - color = False - else: - raise Exception('Not a PFM file.') - - if (sys.version[0]) == '3': - dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) - else: - dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) - if dim_match: - width, height = map(int, dim_match.groups()) - else: - raise Exception('Malformed PFM header.') - - if (sys.version[0]) == '3': - scale = float(file.readline().rstrip().decode('utf-8')) - else: - scale = float(file.readline().rstrip()) - - if scale < 0: # little-endian - endian = '<' - scale = -scale - else: - endian = '>' # big-endian - - data = np.fromfile(file, endian + 'f') - shape = (height, width, 3) if color else (height, width) - - data = np.reshape(data, shape) - data = np.flipud(data) - return data, scale - -class MiddleBurry(U4KDataset): - def load_data_list(self): - """Load annotation from directory. - Args: - data_root (str): Data root for img_dir/ann_dir. - split (str|None): Split txt file. If split is specified, only file - with suffix in the splits will be loaded. Otherwise, all images - in img_dir/ann_dir will be loaded. Default: None - Returns: - list[dict]: All image info of dataset. 
- """ - data_root = self.data_root - split = self.split - - self.invalid_depth_num = 0 - img_infos = [] - if split is not None: - with open(split) as f: - for line in f: - img_info_l = dict() - - img_l, depth_map_l = line.strip().split(" ") - - img_info_l['depth_map_path'] = osp.join(data_root, remove_leading_slash(depth_map_l)) - img_info_l['img_path'] = osp.join(data_root, remove_leading_slash(img_l)) - img_info_l['depth_fields'] = [] - - filename = img_info_l['depth_map_path'] - ext_name_l = filename.replace('disp0.pfm', 'calib.txt') - with open(ext_name_l, 'r') as f: - ext_l = f.readlines() - - cam_info = ext_l[0].strip() - cam_info_f = float(cam_info.split(' ')[0].split('[')[1]) - base = float(ext_l[3].strip().split('=')[1]) - doffs = float(ext_l[2].strip().split('=')[1]) - - f = cam_info_f - img_info_l['focal'] = f - base = base - img_info_l['depth_factor'] = base * f - img_info_l['doffs'] = doffs - - img_infos.append(img_info_l) - else: - raise NotImplementedError - - # github issue:: make sure the same order - img_infos = sorted(img_infos, key=lambda x: x['img_path']) - if self.mode == 'train': - img_infos = img_infos * 100 - return img_infos - - - def __getitem__(self, idx): - - img_file_path = self.data_infos[idx]['img_path'] - disp_path = self.data_infos[idx]['depth_map_path'] - depth_factor = self.data_infos[idx]['depth_factor'] - - height=2160 - width=3840 - height = 1840 - width = 2300 - - image = Image.open(img_file_path).convert("RGB") - image = np.asarray(image, dtype=np.uint8) / 255.0 - image = image.astype(np.float32) - - disp_gt, scale = readPFM(disp_path) - disp_gt = disp_gt.astype(np.float32) - - h, w, _ = image.shape - h_start = int(h / 2 - height / 2) - h_end = h_start + height - w_start = int(w / 2 - width / 2) - w_end = w_start + width - image = image[h_start:h_end, w_start:w_end, :] - disp_gt = disp_gt[h_start:h_end, w_start:w_end] - - disp_gt_copy = disp_gt.copy() - disp_gt = disp_gt[..., np.newaxis] - invalid_mask = disp_gt == np.inf - - depth_gt = depth_factor / (disp_gt + self.data_infos[idx]['doffs']) - depth_gt = depth_gt / 1000 - depth_gt[invalid_mask] = 0 # set to a invalid number - disp_gt_copy[invalid_mask[:, :, 0]] = 0 - focal = self.data_infos[idx]['focal'] - - bbox = None - bboxs_res = None - crop_areas = None - bboxs_roi = None # hack for infer - - if self.mode == 'train': - image, depth_gt = self.train_preprocess(image, depth_gt) - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - if self.random_crop: # use in sec_stage - if self.consistency_training: - crop_y1, crop_y2, crop_x1, crop_x2 = self.get_crop_bbox(image) # ensure the prob of crop is the same - while True: - # shift_x = random.randint(self.overlap_length//3, self.overlap_length) - # shift_y = random.randint(self.overlap_length//3, self.overlap_length) - shift_x = self.overlap_length_w - shift_y = self.overlap_length_h - if random.random() > 0.5: - shift_x = shift_x * -1 - if random.random() > 0.5: - shift_y = shift_y * -1 - crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift = crop_y1 + shift_y, crop_y2 + shift_y, crop_x1 + shift_x, crop_x2 + shift_x - if crop_y1_shift > 0 and crop_x1_shift > 0 and crop_y2_shift < image.shape[0] and crop_x2_shift < image.shape[1]: - break - bbox_ori = (crop_y1, crop_y2, crop_x1, crop_x2) - bbox_shift = (crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift) - image_ori, crop_area_ori = self.crop(image, bbox_ori, tmp=True) - image_shift, crop_area_shift = self.crop(image, bbox_shift, tmp=True) - depth_gt_ori = 
self.crop(depth_gt, bbox_ori) - depth_gt_shift = self.crop(depth_gt, bbox_shift) - disp_gt_copy_ori = self.crop(disp_gt_copy, bbox_ori) - disp_gt_copy_shift = self.crop(disp_gt_copy, bbox_shift) - - bboxs_ori = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_shift = torch.tensor([crop_x1_shift / width * 512, crop_y1_shift / height * 384, crop_x2_shift / width * 512, crop_y2_shift / height * 384]) - - else: - bbox = self.get_crop_bbox(image) - image, crop_area = self.crop(image, bbox, tmp=True) - depth_gt = self.crop(depth_gt, bbox) - disp_gt_copy = self.crop(disp_gt_copy, bbox) - - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bboxs_res = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) # coord in 384, 512 - - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] - mask_raw = np.logical_and(depth_gt_temp > self.config.min_depth, depth_gt_temp < self.config.max_depth).squeeze()[None, ...] - sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'mask': mask, 'image_raw': image.copy(), 'mask_raw': mask_raw} - - - if self.random_crop: - if self.consistency_training: - image = np.concatenate([image_ori, image_shift], axis=-1) - depth_gt = np.concatenate([depth_gt_ori, depth_gt_shift], axis=-1) - crop_area = np.concatenate([crop_area_ori, crop_area_shift], axis=-1) - bboxs_res = torch.cat([bboxs_ori, bboxs_shift], dim=-1) - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth) - - # hack the sample dict - sample['image'] = image - sample['depth'] = depth_gt - sample['crop_area'] = crop_area - sample['bbox'] = bboxs_res - sample['shift'] = torch.tensor([shift_y, shift_x]) # h direction, then w direction - sample['mask'] = mask - - else: - if bboxs_res is not None: - sample['bbox'] = bboxs_res - sample['crop_area'] = crop_area - - if self.sampled_training: - self.data_sampler(sample, disp_gt_copy) - - # update mask - sample_points = sample['sample_points'] - sample_mask = np.logical_and(sample_points[:, -1] > self.config.min_depth, - sample_points[:, -1] < self.config.max_depth).squeeze()[None, ...] - sample['sample_mask'] = sample_mask - - - else: - # nothing needs to be changed for consistency training. 
- - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - if self.sec_stage: - # x_start, y_start = [0, 540, 1080, 1620], [0, 960, 1920, 2880] - x_start, y_start = [0 + 3 * self.overlap / 2, 540 + self.overlap / 2, 1080 - self.overlap / 2, 1620 - 3 * self.overlap / 2], \ - [0 + 3 * self.overlap / 2, 960 + self.overlap / 2, 1920 - self.overlap / 2, 2880 - 3 * self.overlap / 2] - img_crops = [] - bboxs_roi = [] - crop_areas = [] - for x in x_start: - for y in y_start: - bbox = (int(x), int(x+540), int(y), int(y+960)) - img_crop, crop_area = self.crop(image, bbox, tmp=True) - img_crops.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_roi.append(bbox_roi) - - image = img_crops - bboxs_roi = torch.stack(bboxs_roi, dim=0) - - # bbox = (820, 1360 ,1440, 2400) # a hack version for quick evaluation - # image = self.crop(image, bbox) - # depth_gt = self.crop(depth_gt, bbox) - # disp_gt_copy = self.crop(disp_gt_copy, bbox) - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] - - disp_gt_edges = get_boundaries(disp_gt_copy, th=1, dilation=0) - - if self.mode == 'online_eval': - sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': True, - 'image_path': img_file_path, 'depth_path': disp_path, 'depth_factor_path': depth_factor, - 'mask': mask, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - if crop_areas is not None: - sample['crop_area'] = crop_areas - - - else: - sample = {'image': image, 'focal': focal, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges, 'image_path': img_file_path} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - if crop_areas is not None: - sample['crop_area'] = crop_areas - - if self.transform: - sample['img_temp'] = img_temp - sample['depth_gt_temp'] = depth_gt_temp - sample = self.transform(sample) - - sample['dataset'] = self.config.dataset - return sample - - def __len__(self): - return len(self.data_infos) - -def get_mid_loader(config, mode, transform): - if mode == 'train': - log = 0 - dataset = MiddleBurry(config, mode, config.data_path, config.filenames_train) - if config.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - else: - train_sampler = None - - dataloader = DataLoader(dataset, - batch_size=config.batch_size, - shuffle=(train_sampler is None), - num_workers=config.workers, - pin_memory=True, - persistent_workers=True, - sampler=train_sampler) - - elif mode == 'online_eval': - dataset = MiddleBurry(config, mode, config.data_path, config.filenames_val) - # dataset = U4KDataset(config, mode, config.data_path, config.filenames_train) - - - if config.distributed: # redundant. 
here only for readability and to be more explicit - # Give whole test set to all processes (and report evaluation only on one) regardless - eval_sampler = None - else: - eval_sampler = None - - dataloader = DataLoader(dataset, 1, - shuffle=False, - num_workers=1, - pin_memory=False, - sampler=eval_sampler) - - else: - dataset = MiddleBurry(config, mode, config.data_path, config.filenames_test) - dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=1) - - return dataloader - diff --git a/zoedepth/data/u4k.py b/zoedepth/data/u4k.py deleted file mode 100644 index 0249c239486ee7e78c6ce261c2e85b9a2e9213c1..0000000000000000000000000000000000000000 --- a/zoedepth/data/u4k.py +++ /dev/null @@ -1,668 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
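The middleburry.py loader removed above recovers metric depth from PFM disparities using the calibration parsed from calib.txt: depth = baseline * focal / (disparity + doffs), converted from millimetres to metres, with non-finite disparities marked invalid. A compact sketch of that conversion (argument names are descriptive, not the loader's):

import numpy as np

def middlebury_disp_to_depth(disp, focal_px, baseline_mm, doffs):
    # depth [m] = (baseline [mm] * focal [px]) / (disparity [px] + doffs) / 1000
    disp = disp.astype(np.float32)
    invalid = ~np.isfinite(disp)                        # PFM files mark unknown pixels with inf
    depth = (baseline_mm * focal_px) / (disp + doffs)   # millimetres
    depth = depth / 1000.0                              # metres
    depth[invalid] = 0.0                                # 0 marks invalid pixels, as in the deleted loader
    return depth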
- -# File author: Zhenyu Li - -# This file is partly inspired from ZoeDepth (https://github.com/isl-org/ZoeDepth/blob/main/zoedepth/data/data_mono.py); author: Shariq Farooq Bhat - -import os -import numpy as np -import torch -from PIL import Image -from torch.utils.data import DataLoader, Dataset -from torchvision import transforms -import os.path as osp -import random -import torch.nn as nn -import cv2 -import copy -from zoedepth.utils.misc import get_boundaries -from zoedepth.models.base_models.midas import Resize - -class SampleDataPairs(object): - - def __init__(self, - num_sample_inout=50000, - sampling_strategy='random', # or 'dda' - dilation_factor=10, - crop_size=(2160, 3840), - ): - - self.num_sample_inout = num_sample_inout - self.sampling_strategy = sampling_strategy - self.dilation_factor = dilation_factor - self.crop_height, self.crop_width = crop_size[0], crop_size[1] - self.__init_grid() - - def __init_grid(self): - nu = np.linspace(0, self.crop_width - 1, self.crop_width) - nv = np.linspace(0, self.crop_height - 1, self.crop_height) - u, v = np.meshgrid(nu, nv) - - self.u = u.flatten() - self.v = v.flatten() - - def get_coords(self, gt): - # get subpixel coordinates - u = self.u + np.random.random_sample(self.u.size) - v = self.v + np.random.random_sample(self.v.size) - - # use nearest neighbor to get gt for each samples - d = gt[np.clip(np.rint(v).astype(np.uint16), 0, self.crop_height-1), - np.clip(np.rint(u).astype(np.uint16), 0, self.crop_width-1)] - - # remove invalid depth values - u = u[np.nonzero(d)] - v = v[np.nonzero(d)] - d = d[np.nonzero(d)] - - return np.stack((u, v, d), axis=-1) - - - def get_boundaries(self, disp, th=1., dilation=10): - edges_y = np.logical_or(np.pad(np.abs(disp[1:, :] - disp[:-1, :]) > th, ((1, 0), (0, 0))), - np.pad(np.abs(disp[:-1, :] - disp[1:, :]) > th, ((0, 1), (0, 0)))) - edges_x = np.logical_or(np.pad(np.abs(disp[:, 1:] - disp[:, :-1]) > th, ((0, 0), (1, 0))), - np.pad(np.abs(disp[:, :-1] - disp[:,1:]) > th, ((0, 0), (0, 1)))) - edges = np.logical_or(edges_y, edges_x).astype(np.float32) - - if dilation > 0: - kernel = np.ones((dilation, dilation), np.uint8) - edges = cv2.dilate(edges, kernel, iterations=1) - - return edges - - - def __call__(self, results, disp_gt_copy): - """Call function to load multiple types annotations. - - Args: - results (dict): Result dict from :obj:`depth.CustomDataset`. - - Returns: - dict: The dict contains loaded depth estimation annotations. - """ - - gt = results['depth'] - gt_squeeze = gt[:, :, 0] - if self.sampling_strategy == "random": - random_points = self.get_coords(gt_squeeze) - idx = np.random.choice(random_points.shape[0], self.num_sample_inout) - points = random_points[idx, :] - - elif self.sampling_strategy == "dda": - disp_gt_squeeze = disp_gt_copy - edges = self.get_boundaries(disp_gt_squeeze, dilation=self.dilation_factor) - random_points = self.get_coords(gt_squeeze * (1. 
- edges)) - edge_points = self.get_coords(gt_squeeze * edges) - - # if edge points exist - if edge_points.shape[0]>0 and random_points.shape[0]>0: - # Check tot num of edge points - cond = edges.sum()//2 - self.num_sample_inout//2 < 0 - tot= (self.num_sample_inout - int(edges.sum())//2, int(edges.sum())//2) if cond else \ - (self.num_sample_inout//2, self.num_sample_inout//2) - - idx = np.random.choice(random_points.shape[0], tot[0]) - idx_edges = np.random.choice(edge_points.shape[0], tot[1]) - points = np.concatenate([edge_points[idx_edges, :], random_points[idx, :]], 0) - # use uniform sample otherwise - else: - random_points = self.get_coords(gt_squeeze) - idx = np.random.choice(random_points.shape[0], self.num_sample_inout) - points = random_points[idx, :] - - - results['sample_points'] = points - - return results - -def _is_pil_image(img): - return isinstance(img, Image.Image) - -def _is_numpy_image(img): - return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) - -class ToTensor(object): - def __init__(self, mode, do_normalize=False, size=None, sec_stage=False): - self.mode = mode - # don't do normalization as default - self.normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity() - self.size = size - if size is not None: - # self.resize = transforms.Resize(size=size) - net_h, net_w = size - self.resize = Resize(net_w, net_h, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal") - else: - self.resize = nn.Identity() - self.sec_stage = sec_stage - - def __call__(self, sample): - image, focal = sample['image'], sample['focal'] - crop_areas = sample.get('crop_area', None) - - if isinstance(image, list): - # there must be crop_areas - # only infer on eval sec_stage - imgs_process = [] - crp_process = [] - for img, crp in zip(image, crop_areas): - img = self.to_tensor(img) - img = self.normalize(img) - img = img.unsqueeze(dim=0) - img = self.resize(img) - img = img.squeeze(dim=0) - imgs_process.append(img) - - crp = self.to_tensor(crp) - crp = crp.unsqueeze(dim=0) - crp = self.resize(crp) - crp = crp.squeeze(dim=0) - crp_process.append(crp) - - image = torch.cat(imgs_process, dim=0) - crop_areas = torch.cat(crp_process, dim=0) - - img_temp = sample['img_temp'] - img_temp = self.to_tensor(img_temp) - img_temp = self.normalize(img_temp) - img_temp = img_temp.unsqueeze(dim=0) - img_temp = self.resize(img_temp) #NOTE: hack - img_temp = img_temp.squeeze(dim=0) - image_raw = copy.deepcopy(img_temp) - - - else: - image = self.to_tensor(image) - image = self.normalize(image) - - if crop_areas is not None: - crop_areas = self.to_tensor(crop_areas) - crop_areas = crop_areas.unsqueeze(dim=0) - crop_areas = self.resize(crop_areas) - crop_areas = crop_areas.squeeze(dim=0) - - if self.sec_stage: - img_temp = sample['img_temp'] - img_temp = self.to_tensor(img_temp) - img_temp = self.normalize(img_temp) - - img_temp = img_temp.unsqueeze(dim=0) - img_temp = self.resize(img_temp) - image_raw = img_temp.squeeze(dim=0) - - image = image.unsqueeze(dim=0) - image = self.resize(image) - image = image.squeeze(dim=0) - - else: - # in the first stage, this hr info is reserved - image_raw = copy.deepcopy(image) - image = image.unsqueeze(dim=0) - image = self.resize(image) - image = image.squeeze(dim=0) - - if self.mode == 'test': - return_dict = {'image': image, 'focal': focal} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict - - - depth = sample['depth'] - depth = 
self.to_tensor(depth) - depth_gt_temp = sample['depth_gt_temp'] - depth_gt_raw = self.to_tensor(depth_gt_temp) - - if self.mode == 'train': - return_dict = {**sample, 'image': image, 'depth': depth, 'focal': focal, 'image_raw': image_raw, 'depth_raw': depth_gt_raw} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict - else: - has_valid_depth = sample['has_valid_depth'] - # image = self.resize(image) - return_dict = {**sample, 'image': image, 'depth': depth, 'focal': focal, 'image_raw': image_raw, - 'has_valid_depth': has_valid_depth, 'image_path': sample['image_path'], 'depth_path': sample['depth_path'], - 'depth_raw': depth_gt_raw} - if crop_areas is not None: - return_dict['crop_area'] = crop_areas - return return_dict - - def to_tensor(self, pic): - if isinstance(pic, np.ndarray): - img = torch.from_numpy(pic.transpose((2, 0, 1))) # img here - return img - - -def preprocessing_transforms(mode, sec_stage=False, **kwargs): - return transforms.Compose([ - ToTensor(mode=mode, sec_stage=sec_stage, **kwargs) - ]) - -def remove_leading_slash(s): - if s[0] == '/' or s[0] == '\\': - return s[1:] - return s - -class U4KDataset(Dataset): - def __init__(self, config, mode, data_root, split): - self.mode = mode - self.config = config - self.data_root = data_root - self.split = split - - img_size = self.config.get("img_size", None) - img_size = img_size if self.config.get( - "do_input_resize", False) else None - self.sec_stage = self.config.get("sec_stage", False) - self.transform = preprocessing_transforms(mode, size=img_size, sec_stage=self.sec_stage) - - self.data_infos = self.load_data_list() - - self.sampled_training = self.config.get("sampled_training", False) - if self.sampled_training: - self.data_sampler = SampleDataPairs( - num_sample_inout=config.num_sample_inout, - sampling_strategy=config.sampling_strategy, # or 'dda' - dilation_factor=config.dilation_factor, - crop_size=config.transform_sample_gt_size) - - self.random_crop = self.config.get("random_crop", False) - self.crop_size = [540, 960] # 1/4 - self.overlap = self.config.get("overlap", False) - - self.consistency_training = self.config.get("consistency_training", False) - self.overlap_length_h = self.config.get("overlap_length_h", int(256)) - self.overlap_length_w = self.config.get("overlap_length_w", int(384)) - print("current overlap_length_h and overlap_length_w are {} and {}".format(self.overlap_length_h, self.overlap_length_w)) - - - def load_data_list(self): - """Load annotation from directory. - Args: - data_root (str): Data root for img_dir/ann_dir. - split (str|None): Split txt file. If split is specified, only file - with suffix in the splits will be loaded. Otherwise, all images - in img_dir/ann_dir will be loaded. Default: None - Returns: - list[dict]: All image info of dataset. 
- """ - data_root = self.data_root - split = self.split - - self.invalid_depth_num = 0 - img_infos = [] - if split is not None: - with open(split) as f: - for line in f: - img_info_l = dict() - # img_info_r = dict() - - img_l, img_r, depth_map_l, depth_map_r = line.strip().split(" ") - - # HACK: a hack to replace the png with raw to accelerate training - img_l = img_l[:-3] + 'raw' - # img_r = img_r[:-3] + 'raw' - - img_info_l['depth_map_path'] = osp.join(data_root, remove_leading_slash(depth_map_l)) - # img_info_r['depth_map_path'] = osp.join(data_root, remove_leading_slash(depth_map_r)) - - img_info_l['img_path'] = osp.join(data_root, remove_leading_slash(img_l)) - # img_info_r['filename'] = osp.join(data_root, remove_leading_slash(img_r)) - - img_info_l['depth_fields'] = [] - - filename = img_info_l['depth_map_path'] - ext_name_l = filename.replace('Disp0', 'Extrinsics0') - ext_name_l = ext_name_l.replace('npy', 'txt') - ext_name_r = filename.replace('Disp0', 'Extrinsics1') - ext_name_r = ext_name_r.replace('npy', 'txt') - with open(ext_name_l, 'r') as f: - ext_l = f.readlines() - with open(ext_name_r, 'r') as f: - ext_r = f.readlines() - f = float(ext_l[0].split(' ')[0]) - img_info_l['focal'] = f - base = abs(float(ext_l[1].split(' ')[3]) - float(ext_r[1].split(' ')[3])) - img_info_l['depth_factor'] = base * f - - img_infos.append(img_info_l) - # img_infos.append(img_info_r) - else: - raise NotImplementedError - - # github issue:: make sure the same order - img_infos = sorted(img_infos, key=lambda x: x['img_path']) - return img_infos - - - def augment_image(self, image): - # gamma augmentation - gamma = random.uniform(0.9, 1.1) - image_aug = image ** gamma - - # brightness augmentation - if self.config.dataset == 'nyu': - brightness = random.uniform(0.75, 1.25) - else: - brightness = random.uniform(0.9, 1.1) - image_aug = image_aug * brightness - - # color augmentation - colors = np.random.uniform(0.9, 1.1, size=3) - white = np.ones((image.shape[0], image.shape[1])) - color_image = np.stack([white * colors[i] for i in range(3)], axis=2) - image_aug *= color_image - image_aug = np.clip(image_aug, 0, 1) - - return image_aug - - def train_preprocess(self, image, depth_gt): - if self.config.aug: - # Random flipping - do_flip = random.random() - if do_flip > 0.5: - image = (image[:, ::-1, :]).copy() - depth_gt = (depth_gt[:, ::-1, :]).copy() - - # Random gamma, brightness, color augmentation - do_augment = random.random() - if do_augment > 0.5: - image = self.augment_image(image) - - return image, depth_gt - - def get_crop_bbox(self, img): - """Randomly get a crop bounding box.""" - margin_h = max(img.shape[0] - self.crop_size[0], 0) - margin_w = max(img.shape[1] - self.crop_size[1], 0) - offset_h = np.random.randint(0, margin_h + 1) - offset_w = np.random.randint(0, margin_w + 1) - crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] - crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] - - return crop_y1, crop_y2, crop_x1, crop_x2 - - def crop(self, img, crop_bbox, tmp=False): - """Crop from ``img``""" - - crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox - if tmp: - templete = np.zeros((img.shape[0], img.shape[1], 1), dtype=np.float32) - templete[crop_y1:crop_y2, crop_x1:crop_x2, :] = 1.0 - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - return img, templete - - else: - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] 
- return img - - def __getitem__(self, idx): - - img_file_path = self.data_infos[idx]['img_path'] - disp_path = self.data_infos[idx]['depth_map_path'] - depth_factor = self.data_infos[idx]['depth_factor'] - - height=2160 - width=3840 - image = np.fromfile(open(img_file_path, 'rb'), dtype=np.uint8).reshape(height, width, 3) / 255.0 - if self.config.get("use_rgb", False): - image = image.astype(np.float32)[:, :, ::-1].copy() - elif self.config.get("use_brg", False): - image = image.astype(np.float32)[:, :, [0, 2, 1]].copy() - elif self.config.get("use_gbr", False): - image = image.astype(np.float32)[:, :, [1, 0, 2]].copy() - elif self.config.get("use_rbg", False): - image = image.astype(np.float32)[:, :, [2, 0, 1]].copy() - elif self.config.get("use_grb", False): - image = image.astype(np.float32)[:, :, [1, 2, 0]].copy() - else: - image = image.astype(np.float32) - - disp_gt = np.expand_dims(np.load(disp_path, mmap_mode='c'), -1) - disp_gt = disp_gt.astype(np.float32) - disp_gt_copy = disp_gt[:, :, 0].copy() - - depth_gt = depth_factor / disp_gt - depth_gt[depth_gt > self.config.max_depth] = self.config.max_depth # for vis - focal = self.data_infos[idx]['focal'] - - - bbox = None - bboxs_res = None - crop_areas = None - bboxs_roi = None # hack for infer - - if self.mode == 'train': - image, depth_gt = self.train_preprocess(image, depth_gt) - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - if self.random_crop: # use in sec_stage - if self.consistency_training: - crop_y1, crop_y2, crop_x1, crop_x2 = self.get_crop_bbox(image) # ensure the prob of crop is the same - while True: - # shift_x = random.randint(self.overlap_length//3, self.overlap_length) - # shift_y = random.randint(self.overlap_length//3, self.overlap_length) - shift_x = self.overlap_length_w - shift_y = self.overlap_length_h - if random.random() > 0.5: - shift_x = shift_x * -1 - if random.random() > 0.5: - shift_y = shift_y * -1 - crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift = crop_y1 + shift_y, crop_y2 + shift_y, crop_x1 + shift_x, crop_x2 + shift_x - if crop_y1_shift > 0 and crop_x1_shift > 0 and crop_y2_shift < image.shape[0] and crop_x2_shift < image.shape[1]: - break - bbox_ori = (crop_y1, crop_y2, crop_x1, crop_x2) - bbox_shift = (crop_y1_shift, crop_y2_shift, crop_x1_shift, crop_x2_shift) - image_ori, crop_area_ori = self.crop(image, bbox_ori, tmp=True) - image_shift, crop_area_shift = self.crop(image, bbox_shift, tmp=True) - depth_gt_ori = self.crop(depth_gt, bbox_ori) - depth_gt_shift = self.crop(depth_gt, bbox_shift) - disp_gt_copy_ori = self.crop(disp_gt_copy, bbox_ori) - disp_gt_copy_shift = self.crop(disp_gt_copy, bbox_shift) - - bboxs_ori = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_shift = torch.tensor([crop_x1_shift / width * 512, crop_y1_shift / height * 384, crop_x2_shift / width * 512, crop_y2_shift / height * 384]) - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_shift = torch.tensor([crop_x1_shift, crop_y1_shift, crop_x2_shift, crop_y2_shift]) - - - else: - bbox = self.get_crop_bbox(image) - image, crop_area = self.crop(image, bbox, tmp=True) - depth_gt = self.crop(depth_gt, bbox) - disp_gt_copy = self.crop(disp_gt_copy, bbox) - - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bboxs_res = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) # coord in 384, 512 - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, 
crop_y2]) - - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] - mask_raw = np.logical_and(depth_gt_temp > self.config.min_depth, depth_gt_temp < self.config.max_depth).squeeze()[None, ...] - sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'mask': mask, 'image_raw': image.copy(), 'mask_raw': mask_raw, 'image_path': img_file_path} - - if self.random_crop: - if self.consistency_training: - image = np.concatenate([image_ori, image_shift], axis=-1) - depth_gt = np.concatenate([depth_gt_ori, depth_gt_shift], axis=-1) - crop_area = np.concatenate([crop_area_ori, crop_area_shift], axis=-1) - bboxs_res = torch.cat([bboxs_ori, bboxs_shift], dim=-1) - bboxes_raw_res = torch.cat([bboxs_raw, bboxs_raw_shift], dim=-1) - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth) - - # hack the sample dict - sample['image'] = image - sample['depth'] = depth_gt - sample['crop_area'] = crop_area - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxes_raw_res - sample['shift'] = torch.tensor([shift_y, shift_x]) # h direction, then w direction - sample['mask'] = mask - - else: - if bboxs_res is not None: - sample['bbox'] = bboxs_res - sample['bbox_raw'] = bboxs_raw - sample['crop_area'] = crop_area - - if self.sampled_training: - self.data_sampler(sample, disp_gt_copy) - - # update mask - sample_points = sample['sample_points'] - sample_mask = np.logical_and(sample_points[:, -1] > self.config.min_depth, - sample_points[:, -1] < self.config.max_depth).squeeze()[None, ...] - sample['sample_mask'] = sample_mask - - - else: - # nothing needs to be changed for consistency training. - - img_temp = copy.deepcopy(image) - depth_gt_temp = copy.deepcopy(depth_gt) - - if self.sec_stage: - # x_start, y_start = [0, 540, 1080, 1620], [0, 960, 1920, 2880] - x_start, y_start = [0 + 3 * self.overlap / 2, 540 + self.overlap / 2, 1080 - self.overlap / 2, 1620 - 3 * self.overlap / 2], \ - [0 + 3 * self.overlap / 2, 960 + self.overlap / 2, 1920 - self.overlap / 2, 2880 - 3 * self.overlap / 2] - img_crops = [] - bboxs_roi = [] - crop_areas = [] - bboxs_raw_list = [] - for x in x_start: - for y in y_start: - bbox = (int(x), int(x+540), int(y), int(y+960)) - img_crop, crop_area = self.crop(image, bbox, tmp=True) - img_crops.append(img_crop) - crop_areas.append(crop_area) - crop_y1, crop_y2, crop_x1, crop_x2 = bbox - bbox_roi = torch.tensor([crop_x1 / width * 512, crop_y1 / height * 384, crop_x2 / width * 512, crop_y2 / height * 384]) - bboxs_roi.append(bbox_roi) - bboxs_raw = torch.tensor([crop_x1, crop_y1, crop_x2, crop_y2]) - bboxs_raw_list.append(bboxs_raw) - - image = img_crops - bboxs_roi = torch.stack(bboxs_roi, dim=0) - bboxs_raw = torch.stack(bboxs_raw_list, dim=0) - - # bbox = (820, 1360 ,1440, 2400) # a hack version for quick evaluation - # image = self.crop(image, bbox) - # depth_gt = self.crop(depth_gt, bbox) - # disp_gt_copy = self.crop(disp_gt_copy, bbox) - - mask = np.logical_and(depth_gt > self.config.min_depth, - depth_gt < self.config.max_depth).squeeze()[None, ...] 
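The consistency-training branch earlier in this file pairs each random crop with a second crop displaced by (overlap_length_h, overlap_length_w) in a randomly chosen direction, rejecting shifts that fall outside the image. A standalone sketch of that pairing step, with get_crop_bbox passed in as a callable so the sketch stays self-contained; it mirrors the loop above, including its retry-until-fit behaviour:

import random

def shifted_crop_pair(img_h, img_w, shift_h, shift_w, get_crop_bbox):
    # Draw one crop, then keep re-drawing the shift signs until the displaced crop fits.
    y1, y2, x1, x2 = get_crop_bbox()                     # (crop_y1, crop_y2, crop_x1, crop_x2)
    while True:
        dy = shift_h if random.random() > 0.5 else -shift_h
        dx = shift_w if random.random() > 0.5 else -shift_w
        if y1 + dy > 0 and x1 + dx > 0 and y2 + dy < img_h and x2 + dx < img_w:
            return (y1, y2, x1, x2), (y1 + dy, y2 + dy, x1 + dx, x2 + dx), (dy, dx)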
- - disp_gt_edges = get_boundaries(disp_gt_copy, th=1, dilation=0) - - if self.mode == 'online_eval': - sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': True, - 'image_path': img_file_path, 'depth_path': disp_path, 'depth_factor_path': depth_factor, - 'mask': mask, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges, 'image_path': img_file_path} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - sample['bbox_raw'] = bboxs_raw - if crop_areas is not None: - sample['crop_area'] = crop_areas - - - else: - sample = {'image': image, 'focal': focal, 'image_raw': image.copy(), 'disp_gt_edges': disp_gt_edges, 'image_path': img_file_path} - if bboxs_roi is not None: - sample['bbox'] = bboxs_roi - sample['bbox_raw'] = bboxs_raw - if crop_areas is not None: - sample['crop_area'] = crop_areas - - if self.transform: - sample['img_temp'] = img_temp - sample['depth_gt_temp'] = depth_gt_temp - sample = self.transform(sample) - - sample['dataset'] = self.config.dataset - return sample - - def __len__(self): - return len(self.data_infos) - - -def get_u4k_loader(config, mode, transform): - if mode == 'train': - dataset = U4KDataset(config, mode, config.data_path, config.filenames_train) - dataset[0] - - if config.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - else: - train_sampler = None - - dataloader = DataLoader(dataset, - batch_size=config.batch_size, - shuffle=(train_sampler is None), - num_workers=config.workers, - pin_memory=True, - persistent_workers=True, - sampler=train_sampler) - - elif mode == 'train_save': - dataset = U4KDataset(config, 'online_eval', config.data_path, config.filenames_train) - - if config.distributed: - train_sampler = None - else: - train_sampler = None - - dataloader = DataLoader(dataset, 1, - shuffle=False, - num_workers=1, - pin_memory=False, - sampler=train_sampler) - - - elif mode == 'online_eval': - dataset = U4KDataset(config, mode, config.data_path, config.filenames_val) - # dataset = U4KDataset(config, mode, config.data_path, config.filenames_train) - - - if config.distributed: # redundant. 
here only for readability and to be more explicit - # Give whole test set to all processes (and report evaluation only on one) regardless - eval_sampler = None - else: - eval_sampler = None - - dataloader = DataLoader(dataset, 1, - shuffle=False, - num_workers=1, - pin_memory=False, - sampler=eval_sampler) - - else: - dataset = U4KDataset(config, mode, config.data_path, config.filenames_test) - dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=1) - - return dataloader \ No newline at end of file diff --git a/zoedepth/models/base_models/depth_anything.py b/zoedepth/models/base_models/depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..63f9a0b8f1c1b8507ca73d441e7f99019347cb7f --- /dev/null +++ b/zoedepth/models/base_models/depth_anything.py @@ -0,0 +1,391 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn +import numpy as np +from torchvision.transforms import Normalize +# from zoedepth.models.base_models.dpt_dinov2.dpt import DPT_DINOv2 +from depth_anything.dpt import DPT_DINOv2 + + +def denormalize(x): + """Reverses the imagenet normalization applied to the input. + + Args: + x (torch.Tensor - shape(N,3,H,W)): input tensor + + Returns: + torch.Tensor - shape(N,3,H,W): Denormalized input + """ + mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device) + std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device) + return x * std + mean + +def get_activation(name, bank): + def hook(model, input, output): + bank[name] = output + return hook + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + ): + """Init. + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. 
+ resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". + """ + # print("Params passed to Resize transform:") + # print("\twidth: ", width) + # print("\theight: ", height) + # print("\tresize_target: ", resize_target) + # print("\tkeep_aspect_ratio: ", keep_aspect_ratio) + # print("\tensure_multiple_of: ", ensure_multiple_of) + # print("\tresize_method: ", resize_method) + + self.__width = width + self.__height = height + + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, x): + width, height = self.get_size(*x.shape[-2:][::-1]) + return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True) + +class PrepForMidas(object): + def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True): + if isinstance(img_size, int): + img_size = (img_size, img_size) + net_h, net_w = img_size + # self.normalization = Normalize( + # mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + self.normalization = Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.resizer = Resize(net_w, net_h, 
keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=14, resize_method=resize_mode) \ + if do_resize else nn.Identity() + + def __call__(self, x): + return self.normalization(self.resizer(x)) + + +class DepthAnythingCore(nn.Module): + def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True, img_size=384, core_type='vits', **kwargs): + """Midas Base model used for multi-scale feature extraction. + + Args: + midas (torch.nn.Module): Midas model. + trainable (bool, optional): Train midas model. Defaults to False. + fetch_features (bool, optional): Extract multi-scale features. Defaults to True. + layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'). + freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False. + keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True. + img_size (int, tuple, optional): Input resolution. Defaults to 384. + """ + super().__init__() + self.core_type = core_type + self.core = midas + self.output_channels = None + self.core_out = {} + self.trainable = trainable + self.fetch_features = fetch_features + # midas.scratch.output_conv = nn.Identity() + self.handles = [] + # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1'] + self.layer_names = layer_names + + self.set_trainable(trainable) + self.set_fetch_features(fetch_features) + + self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio, + img_size=img_size, do_resize=kwargs.get('do_resize', True)) + + if freeze_bn: + self.freeze_bn() + + def set_trainable(self, trainable): + self.trainable = trainable + if trainable: + self.unfreeze() + else: + self.freeze() + return self + + def set_fetch_features(self, fetch_features): + self.fetch_features = fetch_features + if fetch_features: + if len(self.handles) == 0: + self.attach_hooks(self.core) + else: + self.remove_hooks() + return self + + def freeze(self): + for p in self.parameters(): + p.requires_grad = False + self.trainable = False + return self + + def unfreeze(self): + for p in self.parameters(): + p.requires_grad = True + self.trainable = True + return self + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + return self + + def forward(self, x, denorm=False, return_rel_depth=False): + # print('input to midas:', x.shape) + with torch.no_grad(): + if denorm: + x = denormalize(x) + x = self.prep(x) + + with torch.set_grad_enabled(self.trainable): + + rel_depth = self.core(x) + if not self.fetch_features: + return rel_depth + out = [self.core_out[k] for k in self.layer_names] + + if return_rel_depth: + return rel_depth, out + return out + + def get_rel_pos_params(self): + for name, p in self.core.pretrained.named_parameters(): + if "pos_embed" in name: + yield p + + def get_enc_params_except_rel_pos(self): + for name, p in self.core.pretrained.named_parameters(): + if "pos_embed" not in name: + yield p + + def freeze_encoder(self, freeze_rel_pos=False): + if freeze_rel_pos: + for p in self.core.pretrained.parameters(): + p.requires_grad = False + else: + for p in self.get_enc_params_except_rel_pos(): + p.requires_grad = False + return self + + def attach_hooks(self, midas): + if len(self.handles) > 0: + self.remove_hooks() + if "out_conv" in 
self.layer_names: + self.handles.append(list(midas.depth_head.scratch.output_conv2.children())[ + 1].register_forward_hook(get_activation("out_conv", self.core_out))) + if "r4" in self.layer_names: + self.handles.append(midas.depth_head.scratch.refinenet4.register_forward_hook( + get_activation("r4", self.core_out))) + if "r3" in self.layer_names: + self.handles.append(midas.depth_head.scratch.refinenet3.register_forward_hook( + get_activation("r3", self.core_out))) + if "r2" in self.layer_names: + self.handles.append(midas.depth_head.scratch.refinenet2.register_forward_hook( + get_activation("r2", self.core_out))) + if "r1" in self.layer_names: + self.handles.append(midas.depth_head.scratch.refinenet1.register_forward_hook( + get_activation("r1", self.core_out))) + if "l4_rn" in self.layer_names: + self.handles.append(midas.depth_head.scratch.layer4_rn.register_forward_hook( + get_activation("l4_rn", self.core_out))) + + return self + + def remove_hooks(self): + for h in self.handles: + h.remove() + return self + + def __del__(self): + self.remove_hooks() + + def set_output_channels(self): + if self.core_type == 'vits': + self.output_channels = [64, 64, 64, 64, 64] + elif self.core_type == 'vitb': + self.output_channels = [128, 128, 128, 128, 128] + elif self.core_type == 'vitl': + self.output_channels = [256, 256, 256, 256, 256] + + @staticmethod + def build(midas_model_type="dinov2_large", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs): + if "img_size" in kwargs: + kwargs = DepthAnythingCore.parse_img_size(kwargs) + img_size = kwargs.pop("img_size", [384, 384]) + + if midas_model_type == 'vits': + depth_anything = DPT_DINOv2(encoder=midas_model_type, features=64, out_channels=[48, 96, 192, 384], use_clstoken=False) + state_dict = torch.load('/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vits14.pth', map_location='cpu') + elif midas_model_type == 'vitb': + depth_anything = DPT_DINOv2(encoder=midas_model_type, features=128, out_channels=[96, 192, 384, 768], use_clstoken=False) + state_dict = torch.load('/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vitb14.pth', map_location='cpu') + elif midas_model_type == 'vitl': + depth_anything = DPT_DINOv2(encoder=midas_model_type, features=256, out_channels=[256, 512, 1024, 1024], use_clstoken=False) + state_dict = torch.load('/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vitl14.pth', map_location='cpu') + else: + raise NotImplementedError + + depth_anything.load_state_dict(state_dict) + + kwargs.update({'keep_aspect_ratio': force_keep_ar}) + + depth_anything_core = DepthAnythingCore(depth_anything, trainable=train_midas, fetch_features=fetch_features, + freeze_bn=freeze_bn, img_size=img_size, core_type=midas_model_type, **kwargs) + + depth_anything_core.set_output_channels() + return depth_anything_core + + @staticmethod + def parse_img_size(config): + assert 'img_size' in config + if isinstance(config['img_size'], str): + assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W" + config['img_size'] = list(map(int, config['img_size'].split(","))) + assert len( + config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W" + elif isinstance(config['img_size'], int): + config['img_size'] = [config['img_size'], config['img_size']] + else: + assert isinstance(config['img_size'], list) and len( + config['img_size']) == 2, "img_size should be a list of H,W" + return config + + 
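For reference, a minimal usage sketch of the DepthAnythingCore wrapper added above. This is not part of the diff: the 'vitl' variant, the 392x518 input size, and the availability of the hard-coded depth_anything_vitl14.pth checkpoint that build() loads are assumptions made purely for illustration.

    import torch
    from zoedepth.models.base_models.depth_anything import DepthAnythingCore

    # Build the ViT-L Depth Anything backbone as a multi-scale feature extractor.
    # img_size may be an int, an [H, W] list, or an "H,W" string; parse_img_size
    # normalizes all three forms to [H, W].
    core = DepthAnythingCore.build(midas_model_type="vitl",
                                   train_midas=False,
                                   fetch_features=True,
                                   freeze_bn=True,
                                   img_size="392,518")  # parsed to [392, 518]

    x = torch.rand(1, 3, 392, 518)  # NCHW image in [0, 1]
    with torch.no_grad():
        rel_depth, feats = core(x, return_rel_depth=True)
    # feats follows layer_names order ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'),
    # captured by the forward hooks registered in attach_hooks().
    print(rel_depth.shape, [f.shape for f in feats])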
+nchannels2models = { + tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"], + (512, 256, 128, 64, 64): ["MiDaS_small"] +} + +# Model name to number of output channels +MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items() + for m in v + } \ No newline at end of file diff --git a/zoedepth/models/base_models/midas.py b/zoedepth/models/base_models/midas.py index e26f8589502f8298bde8820262083f54b254f70e..7ad698d2906b405520a551ddccaa92eb81edc897 100644 --- a/zoedepth/models/base_models/midas.py +++ b/zoedepth/models/base_models/midas.py @@ -82,13 +82,13 @@ class Resize(object): "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ - print("Params passed to Resize transform:") - print("\twidth: ", width) - print("\theight: ", height) - print("\tresize_target: ", resize_target) - print("\tkeep_aspect_ratio: ", keep_aspect_ratio) - print("\tensure_multiple_of: ", ensure_multiple_of) - print("\tresize_method: ", resize_method) + # print("Params passed to Resize transform:") + # print("\twidth: ", width) + # print("\theight: ", height) + # print("\tresize_target: ", resize_target) + # print("\tkeep_aspect_ratio: ", keep_aspect_ratio) + # print("\tensure_multiple_of: ", ensure_multiple_of) + # print("\tresize_method: ", resize_method) self.__width = width self.__height = height @@ -170,7 +170,7 @@ class Resize(object): def __call__(self, x): width, height = self.get_size(*x.shape[-2:][::-1]) - return nn.functional.interpolate(x, (height, width), mode='bilinear', align_corners=True) + return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True) class PrepForMidas(object): def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True): @@ -261,10 +261,7 @@ class MidasCore(nn.Module): x = denormalize(x) x = self.prep(x) # print("Shape after prep: ", x.shape) - with torch.set_grad_enabled(self.trainable): - - # print("Input size to Midascore", x.shape) rel_depth = self.core(x) # print("Output from midas shape", rel_depth.shape) if not self.fetch_features: @@ -337,9 +334,11 @@ class MidasCore(nn.Module): if "img_size" in kwargs: kwargs = MidasCore.parse_img_size(kwargs) img_size = kwargs.pop("img_size", [384, 384]) - print("img_size", img_size) - midas = torch.hub.load("intel-isl/MiDaS", midas_model_type, - pretrained=use_pretrained_midas, force_reload=force_reload) + # print("img_size", img_size) + # midas = torch.hub.load("intel-isl/MiDaS", midas_model_type, + # pretrained=use_pretrained_midas, force_reload=force_reload) + midas = torch.hub.load("AyaanShah2204/MiDaS", midas_model_type, + pretrained=use_pretrained_midas, force_reload=force_reload) # switcher to a better version? 
kwargs.update({'keep_aspect_ratio': force_keep_ar}) midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features, freeze_bn=freeze_bn, img_size=img_size, **kwargs) diff --git a/zoedepth/models/builder.py b/zoedepth/models/builder.py index d3c18c96179da6056029e5407e1446a17abba42d..4363d59689158912a412feb5c296b4a72bc2c608 100644 --- a/zoedepth/models/builder.py +++ b/zoedepth/models/builder.py @@ -22,8 +22,6 @@ # File author: Shariq Farooq Bhat -# This file may include modifications from author Zhenyu Li - from importlib import import_module from zoedepth.models.depth_model import DepthModel diff --git a/zoedepth/models/layers/dist_layers.py b/zoedepth/models/layers/dist_layers.py index 5165fb3eb09c34f9fd281f1040afc9602055a047..572fc0e389cc970c96bb1d4a0c18d9f6e1b42d56 100644 --- a/zoedepth/models/layers/dist_layers.py +++ b/zoedepth/models/layers/dist_layers.py @@ -121,6 +121,7 @@ class ConditionalLogBinomial(nn.Module): return self.log_binomial_transform(p, t) + class ConditionalLogBinomialV2(nn.Module): def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax): """Conditional Log Binomial distribution @@ -163,4 +164,4 @@ class ConditionalLogBinomialV2(nn.Module): """ pt = self.mlp(torch.concat((x, cond), dim=1)) prob, shift = pt[:, :self.n_classes, ...], pt[:, self.n_classes:, ...] - return prob, shift + return prob, shift \ No newline at end of file diff --git a/zoedepth/models/layers/fusion_network.py b/zoedepth/models/layers/fusion_network.py deleted file mode 100644 index 3ecef6d4427201e99be3f163984909646e3efec6..0000000000000000000000000000000000000000 --- a/zoedepth/models/layers/fusion_network.py +++ /dev/null @@ -1,203 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -# File author: Zhenyu Li - - -import torch -import torch.nn as nn -import torch.nn.functional as F -from zoedepth.models.layers.swin_layers import G2LFusion -from zoedepth.models.layers.transformer import TransformerEncoder, TransformerEncoderLayer -from torchvision.ops import roi_align as torch_roi_align - -class DoubleConvWOBN(nn.Module): - """(convolution => [BN] => ReLU) * 2""" - - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - if not mid_channels: - mid_channels = out_channels - self.double_conv = nn.Sequential( - nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=True), - # nn.BatchNorm2d(mid_channels), - nn.ReLU(inplace=True), - nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=True), - # nn.BatchNorm2d(mid_channels), - nn.ReLU(inplace=True)) - - def forward(self, x): - return self.double_conv(x) - -class DoubleConv(nn.Module): - """(convolution => [BN] => ReLU) * 2""" - - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - if not mid_channels: - mid_channels = out_channels - self.double_conv = nn.Sequential( - nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False), - nn.BatchNorm2d(mid_channels), - nn.ReLU(inplace=True), - nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), - nn.BatchNorm2d(out_channels), - nn.ReLU(inplace=True) - ) - - def forward(self, x): - return self.double_conv(x) - - -class Down(nn.Module): - """Downscaling with maxpool then double conv""" - - def __init__(self, in_channels, out_channels): - super().__init__() - self.maxpool_conv = nn.Sequential( - nn.MaxPool2d(2), - DoubleConv(in_channels, out_channels) - ) - - def forward(self, x): - return self.maxpool_conv(x) - -class Upv1(nn.Module): - """Upscaling then double conv""" - - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) - - if mid_channels is not None: - self.conv = DoubleConvWOBN(in_channels, out_channels, mid_channels) - else: - self.conv = DoubleConvWOBN(in_channels, out_channels, in_channels) - - def forward(self, x1, x2): - x1 = self.up(x1) - x = torch.cat([x2, x1], dim=1) - return self.conv(x) - -class UNetv1(nn.Module): - def __init__(self, n_channels, g2l, pos_embed=False, use_area_prior=True): - super(UNetv1, self).__init__() - self.n_channels = n_channels - - self.inc = DoubleConv(n_channels, 32) - self.down1 = Down(32, 256) - self.down2 = Down(256, 256) - self.down3 = Down(256, 256) - self.down4 = Down(256, 256) - self.down5 = Down(256, 256) - - self.up1 = Upv1(256+256+256, 256, 384) - self.up2 = Upv1(256+256+256, 256, 384) - self.up3 = Upv1(256+256+256, 256, 384) - self.up4 = Upv1(256+256+256, 256, 384) - self.up5 = Upv1(256+32+256, 32, 272) - - self.g2l = g2l - - if self.g2l: - self.g2l_att = nn.ModuleList() - win = 12 - in_channels = [32, 256, 256, 256, 256, 256] - crf_dims = [32, 256, 256, 256, 256, 256] - - self.g2l5 = G2LFusion(input_dim=in_channels[5], embed_dim=crf_dims[5], window_size=win, num_heads=32, depth=4, num_patches=12*16) - self.g2l4 = G2LFusion(input_dim=in_channels[4], embed_dim=crf_dims[4], window_size=win, num_heads=32, depth=4, num_patches=24*32) - self.g2l3 = G2LFusion(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, num_heads=16, depth=3, num_patches=48*64) - self.g2l2 = G2LFusion(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, num_heads=16, depth=3, 
num_patches=96*128) - self.g2l1 = G2LFusion(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, num_heads=8, depth=2, num_patches=192*256) - self.g2l0 = G2LFusion(input_dim=in_channels[0], embed_dim=crf_dims[0], window_size=win, num_heads=8, depth=2, num_patches=384*512) - - self.conv5 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4]) - self.conv4 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4]) - self.conv3 = DoubleConvWOBN(in_channels[3] * 2, in_channels[3], in_channels[3]) - self.conv2 = DoubleConvWOBN(in_channels[2] * 2, in_channels[2], in_channels[2]) - self.conv1 = DoubleConvWOBN(in_channels[1] * 2, in_channels[1], in_channels[1]) - self.conv0 = DoubleConvWOBN(in_channels[0] * 2, in_channels[0], in_channels[0]) - - def forward(self, - input_tensor, - guide_plus, - guide_cat, - crop_area_resize=None, - bbox=None, - fine_feat_crop=None, - coarse_feat_whole=None, - coarse_feat_whole_hack=None, - coarse_feat_crop=None): - - # apply unscaled feat to swin - if coarse_feat_whole_hack is not None: - coarse_feat_whole = coarse_feat_whole_hack - - if crop_area_resize is None: - not_use_prior = True - else: - not_use_prior = False - - x1 = self.inc(input_tensor) - x2 = self.down1(x1) - x3 = self.down2(x2) - x4 = self.down3(x3) - x5 = self.down4(x4) - x6 = self.down5(x5) - if self.g2l: - g2l_feat5 = self.g2l5(coarse_feat_whole[0], crop_area_resize[0]) - g2l_feat5 = torch_roi_align(g2l_feat5, bbox, (12, 16), 12/384, aligned=True) - x6 = self.conv5(torch.cat([x6, g2l_feat5], dim=1)) - - x5 = self.up1(torch.cat([x6, guide_cat[0]], dim=1), x5) - if self.g2l: - g2l_feat4 = self.g2l4(coarse_feat_whole[1], crop_area_resize[1]) - g2l_feat4 = torch_roi_align(g2l_feat4, bbox, (24, 32), 24/384, aligned=True) - x5 = self.conv4(torch.cat([x5, g2l_feat4], dim=1)) - - x4 = self.up2(torch.cat([x5, guide_cat[1]], dim=1), x4) - if self.g2l: - g2l_feat3 = self.g2l3(coarse_feat_whole[2], crop_area_resize[2]) - g2l_feat3 = torch_roi_align(g2l_feat3, bbox, (48, 64), 48/384, aligned=True) - x4 = self.conv3(torch.cat([x4, g2l_feat3], dim=1)) - - x3 = self.up3(torch.cat([x4, guide_cat[2]], dim=1), x3) - if self.g2l: - g2l_feat2 = self.g2l2(coarse_feat_whole[3], crop_area_resize[3]) - g2l_feat2 = torch_roi_align(g2l_feat2, bbox, (96, 128), 96/384, aligned=True) - x3 = self.conv2(torch.cat([x3, g2l_feat2], dim=1)) - - x2 = self.up4(torch.cat([x3, guide_cat[3]], dim=1), x2) - if self.g2l: - g2l_feat1 = self.g2l1(coarse_feat_whole[4], crop_area_resize[4]) - g2l_feat1 = torch_roi_align(g2l_feat1, bbox, (192, 256), 192/384, aligned=True) - x2 = self.conv1(torch.cat([x2, g2l_feat1], dim=1)) - - x1 = self.up5(torch.cat([x2, guide_cat[4]], dim=1), x1) - if self.g2l: - g2l_feat0 = self.g2l0(coarse_feat_whole[5], crop_area_resize[5]) - g2l_feat0 = torch_roi_align(g2l_feat0, bbox, (384, 512), 384/384, aligned=True) - x1 = self.conv0(torch.cat([x1, g2l_feat0], dim=1)) - - output = [x1, x2, x3, x4, x5, x6] - return output \ No newline at end of file diff --git a/zoedepth/models/layers/patch_transformer.py b/zoedepth/models/layers/patch_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..99d9e51a06b981bae45ce7dd64eaef19a4121991 --- /dev/null +++ b/zoedepth/models/layers/patch_transformer.py @@ -0,0 +1,91 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software 
without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn + + +class PatchTransformerEncoder(nn.Module): + def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False): + """ViT-like transformer block + + Args: + in_channels (int): Input channels + patch_size (int, optional): patch size. Defaults to 10. + embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128. + num_heads (int, optional): number of attention heads. Defaults to 4. + use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False. + """ + super(PatchTransformerEncoder, self).__init__() + self.use_class_token = use_class_token + encoder_layers = nn.TransformerEncoderLayer( + embedding_dim, num_heads, dim_feedforward=1024) + self.transformer_encoder = nn.TransformerEncoder( + encoder_layers, num_layers=4) # takes shape S,N,E + + self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim, + kernel_size=patch_size, stride=patch_size, padding=0) + + def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'): + """Generate positional encodings + + Args: + sequence_length (int): Sequence length + embedding_dim (int): Embedding dimension + + Returns: + torch.Tensor SBE: Positional encodings + """ + position = torch.arange( + 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1) + index = torch.arange( + 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0) + div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) + pos_encoding = position * div_term + pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) + pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1) + return pos_encoding + + + def forward(self, x): + """Forward pass + + Args: + x (torch.Tensor - NCHW): Input feature tensor + + Returns: + torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim + """ + embeddings = self.embedding_convPxP(x).flatten( + 2) # .shape = n,c,s = n, embedding_dim, s + if self.use_class_token: + # extra special token at start ? 
+ embeddings = nn.functional.pad(embeddings, (1, 0)) + + # change to S,N,E format required by transformer + embeddings = embeddings.permute(2, 0, 1) + S, N, E = embeddings.shape + embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device) + x = self.transformer_encoder(embeddings) # .shape = S, N, E + return x diff --git a/zoedepth/models/model_io.py b/zoedepth/models/model_io.py index ff0e10ed7597dffa3123219d8be69795f2b0438b..78b6579631dd847ac76651238cb5a948b5a66286 100644 --- a/zoedepth/models/model_io.py +++ b/zoedepth/models/model_io.py @@ -46,7 +46,7 @@ def load_state_dict(model, state_dict): state[k] = v - model.load_state_dict(state, strict=False) + model.load_state_dict(state) print("Loaded successfully") return model diff --git a/zoedepth/models/zoedepth/__init__.py b/zoedepth/models/zoedepth/__init__.py index b972c255db238aabe44d955c55f5c8221dcffede..cc33f737d238766559f0e3a8def3c0b568f23b7f 100644 --- a/zoedepth/models/zoedepth/__init__.py +++ b/zoedepth/models/zoedepth/__init__.py @@ -22,7 +22,7 @@ # File author: Shariq Farooq Bhat -from .zoedepth_v1 import ZoeDepth +from .zoedepth_v1 import ZoeDepth all_versions = { "v1": ZoeDepth, diff --git a/zoedepth/models/zoedepth/color_channel/config_zoedepth_rgb.json b/zoedepth/models/zoedepth/color_channel/config_zoedepth_rgb.json deleted file mode 100644 index 61f6520afc4c034304fbe852b06c2b08001a22c4..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth/color_channel/config_zoedepth_rgb.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "model": { - "name": "ZoeDepth", - "version_name": "v1", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "sampled_training": false, - "do_resize": false // do resize in dataloader to speed up - }, - - "train": { - "use_rbg": true, - "train_midas": true, - "use_pretrained_midas": true, - "trainer": "zoedepth", - // "epochs": 5, - // "epochs": 8, - // "epochs": 12, - "epochs": 16, - "bs": 16, - // "optim_kwargs": {"lr": 0.0001, "wd": 0.01}, - "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, - "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.5, "three_phase":false, "cycle_momentum": true}, - // "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": false, - "input_width": 640, - "input_height": 480, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true - - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} \ No newline at end of file diff --git a/zoedepth/models/zoedepth/config_zoedepth.json b/zoedepth/models/zoedepth/config_zoedepth.json index 
70b77dd0dd419eff1a54209bc7786bedc945fa1c..99beb2dcd886006ba87805bddbe408b6d5fdff78 100644 --- a/zoedepth/models/zoedepth/config_zoedepth.json +++ b/zoedepth/models/zoedepth/config_zoedepth.json @@ -16,15 +16,11 @@ "output_distribution": "logbinomial", "memory_efficient": true, "inverse_midas": false, - "img_size": [384, 512], - "sampled_training": false, - // "do_resize": false // do resize in dataloader to speed up - "do_resize": true + "img_size": [384, 512] }, "train": { "train_midas": true, - "use_rgb": true, "use_pretrained_midas": true, "trainer": "zoedepth", "epochs": 5, diff --git a/zoedepth/models/zoedepth/config_zoedepth_fine.json b/zoedepth/models/zoedepth/config_zoedepth_fine.json deleted file mode 100644 index ced6717389d0394b6ae869dcb84d00f8353566f4..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth/config_zoedepth_fine.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "model": { - "name": "ZoeDepth", - "version_name": "v1", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "sampled_training": false, - // "do_resize": false // do resize in dataloader to speed up - "do_resize": false, - - "do_normalize": true, - "do_input_resize": true - }, - - "train": { - "train_midas": true, - "use_rgb": true, - "use_pretrained_midas": true, - "trainer": "zoedepth_custom", - "epochs": 5, - "bs": 16, - "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, - "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": true, - "input_width": 160, - "input_height": 120, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true, - "sec_stage": true - - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} \ No newline at end of file diff --git a/zoedepth/models/zoedepth/zoedepth_v1.py b/zoedepth/models/zoedepth/zoedepth_v1.py index f57d00c90e96a189b3a9da5d63613ba1ce315cd0..091e2a2f0e1c11288c523d30e20b3923b47ff85c 100644 --- a/zoedepth/models/zoedepth/zoedepth_v1.py +++ b/zoedepth/models/zoedepth/zoedepth_v1.py @@ -28,16 +28,17 @@ import torch import torch.nn as nn from zoedepth.models.depth_model import DepthModel from zoedepth.models.base_models.midas import MidasCore +from zoedepth.models.base_models.depth_anything import DepthAnythingCore from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed from zoedepth.models.layers.dist_layers import ConditionalLogBinomial from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed) from zoedepth.models.model_io import load_state_from_resource - +from mmengine import print_log class ZoeDepth(DepthModel): def __init__(self, core, n_bins=64, 
bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10, - n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, + n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, depth_anything=False, midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): """ZoeDepth model. This is the version of ZoeDepth that has a single metric head @@ -63,6 +64,7 @@ class ZoeDepth(DepthModel): """ super().__init__() + self.depth_anything = depth_anything self.core = core self.max_depth = max_depth self.min_depth = min_depth @@ -82,7 +84,6 @@ class ZoeDepth(DepthModel): N_MIDAS_OUT = 32 btlnck_features = self.core.output_channels[0] num_out_features = self.core.output_channels[1:] - self.conv2 = nn.Conv2d(btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0) # btlnck conv @@ -120,10 +121,8 @@ class ZoeDepth(DepthModel): # use log binomial instead of softmax self.conditional_log_binomial = ConditionalLogBinomial( last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp) - - self.collect_feat = {} - - def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): + + def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, hack_feature=None, only_encoder=False, **kwargs): """ Args: x (torch.Tensor): Input image tensor of shape (B, C, H, W) @@ -139,18 +138,40 @@ class ZoeDepth(DepthModel): - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True """ - b, c, h, w = x.shape - # print("input shape ", x.shape) - self.orig_input_width = w - self.orig_input_height = h - rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) - # print("output shapes", rel_depth.shape, out.shape) - - outconv_activation = out[0] - btlnck = out[1] - x_blocks = out[2:] - + temp_features = {} + + if hack_feature is None: + b, c, h, w = x.shape + # print("input shape ", x.shape) + self.orig_input_width = w + self.orig_input_height = h + rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) + + outconv_activation = out[0] + btlnck = out[1] + x_blocks = out[2:] + + # prepare encoder features + encoder_feat_list = [btlnck] + for i in x_blocks: + encoder_feat_list.append(i) + encoder_feat_list.append(outconv_activation) + + if only_encoder: + # only use zoe encoder in this case + return encoder_feat_list + + else: + rel_depth, out = hack_feature[0], hack_feature[1] # forward encoder outputs as inputs in zoedepth + + outconv_activation = out[-1] + btlnck = out[0] + x_blocks = out[1:-1] + + + x_d0 = self.conv2(btlnck) + temp_features['x_d0'] = x_d0 x = x_d0 _, seed_b_centers = self.seed_bin_regressor(x) @@ -163,14 +184,17 @@ class ZoeDepth(DepthModel): prev_b_embedding = self.seed_projector(x) # unroll this loop for better performance - for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks): + # for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks): + for idx, (projector, attractor, x) in enumerate(zip(self.projectors, self.attractors, x_blocks)): b_embedding = projector(x) + temp_features['x_blocks_feat_{}'.format(idx)] = x b, b_centers = attractor( b_embedding, b_prev, prev_b_embedding, interpolate=True) b_prev = b.clone() prev_b_embedding = 
b_embedding.clone() last = outconv_activation + temp_features['midas_final_feat'] = last if self.inverse_midas: # invert depth followed by normalization @@ -181,6 +205,7 @@ class ZoeDepth(DepthModel): rel_cond = rel_depth.unsqueeze(1) rel_cond = nn.functional.interpolate( rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) + temp_features['last'] = last last = torch.cat([last, rel_cond], dim=1) b_embedding = nn.functional.interpolate( @@ -197,11 +222,14 @@ class ZoeDepth(DepthModel): output = dict(metric_depth=out) if return_final_centers or return_probs: output['bin_centers'] = b_centers + output['b_embedding'] = b_embedding if return_probs: output['probs'] = x - output['coarse_depth_pred'] = out + output['temp_features'] = temp_features + # output['encoder_feats'] = encoder_feat_list + return output def get_lr_params(self, lr): @@ -212,39 +240,73 @@ class ZoeDepth(DepthModel): Returns: list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. """ - param_conf = [] - if self.train_midas: - if self.encoder_lr_factor > 0: - param_conf.append({'params': self.core.get_enc_params_except_rel_pos( - ), 'lr': lr / self.encoder_lr_factor}) - - if self.pos_enc_lr_factor > 0: + if self.depth_anything: + param_conf = [] + if self.train_midas: + if self.encoder_lr_factor > 0: + param_conf.append({'params': self.core.get_enc_params_except_rel_pos( + ), 'lr': lr / self.encoder_lr_factor}) + + if self.pos_enc_lr_factor > 0: + param_conf.append( + {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) + + # midas_params = self.core.core.scratch.parameters() + midas_params = self.core.core.depth_head.parameters() + midas_lr_factor = self.midas_lr_factor param_conf.append( - {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) + {'params': midas_params, 'lr': lr / midas_lr_factor}) - midas_params = self.core.core.scratch.parameters() - midas_lr_factor = self.midas_lr_factor - param_conf.append( - {'params': midas_params, 'lr': lr / midas_lr_factor}) + remaining_modules = [] + for name, child in self.named_children(): + if name != 'core': + remaining_modules.append(child) + remaining_params = itertools.chain( + *[child.parameters() for child in remaining_modules]) - remaining_modules = [] - for name, child in self.named_children(): - if name != 'core': - remaining_modules.append(child) - remaining_params = itertools.chain( - *[child.parameters() for child in remaining_modules]) + param_conf.append({'params': remaining_params, 'lr': lr}) + + else: + param_conf = [] + if self.train_midas: + if self.encoder_lr_factor > 0: + param_conf.append({'params': self.core.get_enc_params_except_rel_pos( + ), 'lr': lr / self.encoder_lr_factor}) + + if self.pos_enc_lr_factor > 0: + param_conf.append( + {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) + + midas_params = self.core.core.scratch.parameters() + midas_lr_factor = self.midas_lr_factor + param_conf.append( + {'params': midas_params, 'lr': lr / midas_lr_factor}) - param_conf.append({'params': remaining_params, 'lr': lr}) + remaining_modules = [] + for name, child in self.named_children(): + if name != 'core': + remaining_modules.append(child) + remaining_params = itertools.chain( + *[child.parameters() for child in remaining_modules]) + + param_conf.append({'params': remaining_params, 'lr': lr}) return param_conf @staticmethod def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, 
use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): - core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + # core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + # train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) + if midas_model_type == 'vits' or midas_model_type == 'vitb' or midas_model_type == 'vitl': + core = DepthAnythingCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) + else: + core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) model = ZoeDepth(core, **kwargs) if pretrained_resource: assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" + print_log("Loading deepnet from {}".format(pretrained_resource), logger='current') model = load_state_from_resource(model, pretrained_resource) return model diff --git a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_coarse_stage1.json b/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_coarse_stage1.json deleted file mode 100644 index e93da621ac468a6240bb056ef4635c419c46206a..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_coarse_stage1.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "model": { - "name": "ZoeDepthCustom", - "version_name": "custom", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "do_resize": false, // do resize in dataloader to speed up - "raw_depth_shape": [2160, 3840], - "sr_ratio": 1, - "sampled_training": false, - "transform_sample_gt_size": [2160, 3840], - "sample_feat_level": 4, - "use_hr": false, - "baseline": true - }, - - "train": { - "use_rgb": true, - "train_midas": true, - "use_pretrained_midas": true, - "trainer": "zoedepth_custom", - // "epochs": 5, - // "epochs": 12, - "epochs": 16, - // "epochs": 20, - "bs": 16, - // "optim_kwargs": {"lr": 0.0001, "wd": 0.01}, - "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, - "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.5, "three_phase":false, "cycle_momentum": true}, - // "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": false, - "input_width": 640, - "input_height": 480, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true, - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} - 
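The zoedepth_v1.py changes above add two entry points to ZoeDepth.forward(): only_encoder=True returns the raw backbone feature list, and hack_feature=(rel_depth, feature_list) re-enters the decoder with pre-computed features, unpacked as out[0]=bottleneck, out[1:-1]=decoder blocks, out[-1]=out_conv. The sketch below shows that round trip; it is inferred from the diff rather than documented usage, and it assumes a 'vitl' backbone (which now routes build() to DepthAnythingCore) with the corresponding checkpoint available locally. Note the new mmengine dependency introduced by the print_log import.

    import torch
    from zoedepth.models.zoedepth import ZoeDepth

    model = ZoeDepth.build(midas_model_type="vitl",   # 'vits'/'vitb'/'vitl' -> DepthAnythingCore
                           train_midas=False,
                           use_pretrained_midas=True,
                           freeze_midas_bn=True,
                           depth_anything=True,
                           img_size=[392, 518]).eval()

    x = torch.rand(1, 3, 392, 518)
    with torch.no_grad():
        # Encoder-only pass: [bottleneck (l4_rn), r4, r3, r2, r1, out_conv]
        enc_feats = model(x, only_encoder=True)

        # Re-enter the decoder with features computed elsewhere. rel_depth comes
        # from the backbone; the feature list must keep the ordering above.
        rel_depth, out = model.core(x, return_rel_depth=True)
        feats = [out[1]] + list(out[2:]) + [out[0]]
        pred = model(x, hack_feature=(rel_depth, feats))["metric_depth"]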
diff --git a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_fine_stage2.json b/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_fine_stage2.json deleted file mode 100644 index caa79553f8f35d60b42d72253a97f8079c90a7e0..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_custom_fine_stage2.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "model": { - "name": "ZoeDepthCustom", - "version_name": "custom", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "do_resize": false, // do resize in dataloader to speed up - "raw_depth_shape": [2160, 3840], - "sr_ratio": 1, - "sampled_training": false, - "transform_sample_gt_size": [2160, 3840], - "sample_feat_level": 4, - "use_hr": false, - "baseline": true - }, - - "train": { - "use_rgb": true, - "use_blur": false, - "train_midas": true, - "use_pretrained_midas": true, - "trainer": "zoedepth_custom", - // "epochs": 5, - // "epochs": 12, - "epochs": 16, - // "epochs": 20, - "bs": 16, - // "optim_kwargs": {"lr": 0.0001, "wd": 0.01}, - "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, - "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.5, "three_phase":false, "cycle_momentum": true}, - // "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": true, - "input_width": 640, - "input_height": 480, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true, - "sec_stage": true - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} - diff --git a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion.json b/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion.json deleted file mode 100644 index 33d5bd96012540496a080825aed76675fbb6d3b1..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "model": { - "name": "PatchFusion", - "version_name": "patchfusion", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "do_resize": false, // do resize in dataloader to speed up - "raw_depth_shape": [2160, 3840], - "sr_ratio": 1, - "sampled_training": false, - "transform_sample_gt_size": [2160, 3840], - "sample_feat_level": 4, - "use_hr": false, - "baseline": 
true, - "condition": true, - "freeze": true, - "g2l": true, - "use_fusion_network": true, - "use_area_prior": true - }, - - "train": { - "use_rgb": true, - "use_blur": false, - "train_midas": true, - "use_pretrained_midas": true, - "trainer": "zoedepth_custom", - "epochs": 12, - "bs": 16, - "optim_kwargs": {"lr": 0.0001, "wd": 0.01}, - "sched_kwargs": {"div_factor": 10, "final_div_factor": 10000, "pct_start": 0.5, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": true, - "input_width": 640, - "input_height": 480, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true, - "sec_stage": true, - "multi_consistency": true - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} - diff --git a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion_finetune.json b/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion_finetune.json deleted file mode 100644 index d1e574488daf4a6b9ac004c77766e2c1de99afbc..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/configs/config_zoedepth_patchfusion_finetune.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "model": { - "name": "PatchFusion", - "version_name": "patchfusion", - "n_bins": 64, - "bin_embedding_dim": 128, - "bin_centers_type": "softplus", - "n_attractors":[16, 8, 4, 1], - "attractor_alpha": 1000, - "attractor_gamma": 2, - "attractor_kind" : "mean", - "attractor_type" : "inv", - "midas_model_type" : "DPT_BEiT_L_384", - "min_temp": 0.0212, - "max_temp": 50.0, - "output_distribution": "logbinomial", - "memory_efficient": true, - "inverse_midas": false, - "img_size": [384, 512], - "do_resize": false, // do resize in dataloader to speed up - "raw_depth_shape": [2160, 3840], // 540, 960 - "sr_ratio": 1, - "sampled_training": false, - "transform_sample_gt_size": [2160, 3840], - "sample_feat_level": 4, - "use_hr": false, - "baseline": true, - "condition": true, - "freeze": true, - "g2l": true, - "use_fusion_network": true, - "use_area_prior": true, - "consistency_training": true, - // "consistency_target": "final_feat", - "consistency_target": "mix", - }, - - "train": { - "use_rgb": true, - "use_blur": false, - "train_midas": true, - "use_pretrained_midas": true, - "trainer": "zoedepth_custom", - "epochs": 12, - "bs": 16, - "optim_kwargs": {"lr": 0.0001, "wd": 0.01}, - "sched_kwargs": {"div_factor": 10, "final_div_factor": 10000, "pct_start": 0.5, "three_phase":false, "cycle_momentum": true}, - "same_lr": false, - "w_si": 1, - "w_domain": 0.2, - "w_reg": 0, - "w_grad": 0, - "avoid_boundary": false, - "random_crop": true, - "input_width": 640, - "input_height": 480, - "midas_lr_factor": 1, - "encoder_lr_factor":10, - "pos_enc_lr_factor":10, - "freeze_midas_bn": true, - "sec_stage": true, - "multi_consistency": true, - "w_consistency": 0.1, // consistency weight here - "overlap_length_h": 135, - "overlap_length_w": 240, - "w_p": 0.1 // weight of pred depth in consistency loss - }, - - "infer":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : 
"url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", - "force_keep_ar": true - }, - - "eval":{ - "train_midas": false, - "use_pretrained_midas": false, - "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" - } -} - diff --git a/zoedepth/models/zoedepth_custom/patchfusion.py b/zoedepth/models/zoedepth_custom/patchfusion.py deleted file mode 100644 index 75b25b8b50f5fef9f968fb697f914e46b0a9c502..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/patchfusion.py +++ /dev/null @@ -1,610 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Zhenyu Li - -import itertools - -import math -import copy -import torch -import torch.nn as nn -import numpy as np - -from zoedepth.models.depth_model import DepthModel -from zoedepth.models.base_models.midas import MidasCore -from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed -from zoedepth.models.layers.dist_layers import ConditionalLogBinomial, ConditionalLogBinomialV2 -from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed) -from zoedepth.models.model_io import load_state_from_resource -from torchvision.transforms import Normalize -from torchvision.ops import roi_align as torch_roi_align -from zoedepth.utils.misc import generatemask - -from zoedepth.models.layers.transformer import TransformerDecoderLayer, TransformerEncoderLayer, TransformerEncoder - -from zoedepth.utils.misc import colorize, colors -import matplotlib.pyplot as plt - -from zoedepth.models.layers.fusion_network import UNetv1 -import matplotlib.pyplot as plt - -import os -import torch.distributed as dist -import torch.nn.functional as F - -def check_keywords_in_name(name, keywords=()): - isin = False - for keyword in keywords: - if keyword in name: - isin = True - return isin - -def get_activation(name, bank): - # input of forward_hook will be a function of model/inp/oup - def hook(module, input, output): - bank[name] = output - return hook - -def get_input(name, bank): - # input of forward_hook will be a function of model/inp/oup - def hook(module, input, output): - bank[name] = input - return hook - -class AttributeDict(dict): - def __getattr__(self, key): - try: - return self[key] - except KeyError: - raise AttributeError(key) - - def __setattr__(self, key, value): - self[key] = value - - def 
__delattr__(self, key): - try: - del self[key] - except KeyError: - raise AttributeError(key) - -class PatchFusion(DepthModel): - def __init__(self, coarse_model, fine_model, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10, - n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, - midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, sr_ratio=1, - raw_depth_shape=(2160, 3840), transform_sample_gt_size=(2160, 3840), representation='', - fetch_features=True, sample_feat_level=3, use_hr=False, deform=False, wo_bins=False, baseline=False, - condition=True, freeze=False, g2l=False, use_fusion_network=False, use_area_prior=False, - unet_version='v1', consistency_training=False, consistency_target='unet_feat', pos_embed=False, **kwargs): - """ZoeDepth model. This is the version of ZoeDepth that has a single metric head - - Args: - core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features - n_bins (int, optional): Number of bin centers. Defaults to 64. - bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. - For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus". - bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. - min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3. - max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10. - n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. - attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. - attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. - attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. - attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. - min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. - max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. - train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. - midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. - encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. - pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. - - sr_ratio: sr ratio during infer - raw_depth_shape: raw depth shape during infer. times sr_ratio will be the target resolution. Used to sample points during training - transform_sample_gt_size: training depth shape # influenced by crop shape which is not included in this pipeline right now - representation: I use it to test the "bilap head" and a discarded idea - fetch_features: if fetch feats. 
Default=True - """ - super().__init__() - - self.coarse_model = coarse_model - self.fine_model = fine_model - self.max_depth = max_depth - self.min_depth = min_depth - self.min_temp = min_temp - self.bin_centers_type = bin_centers_type - - self.midas_lr_factor = midas_lr_factor - self.encoder_lr_factor = encoder_lr_factor - self.pos_enc_lr_factor = pos_enc_lr_factor - self.train_midas = train_midas - self.inverse_midas = inverse_midas - - if bin_centers_type == "normed": - SeedBinRegressorLayer = SeedBinRegressor - Attractor = AttractorLayer - elif bin_centers_type == "softplus": # default - SeedBinRegressorLayer = SeedBinRegressorUnnormed - Attractor = AttractorLayerUnnormed - elif bin_centers_type == "hybrid1": - SeedBinRegressorLayer = SeedBinRegressor - Attractor = AttractorLayerUnnormed - elif bin_centers_type == "hybrid2": - SeedBinRegressorLayer = SeedBinRegressorUnnormed - Attractor = AttractorLayer - else: - raise ValueError( - "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") - - N_MIDAS_OUT = 32 - btlnck_features = self.fine_model.core.output_channels[0] - num_out_features = self.fine_model.core.output_channels[1:] # all of them are the same - - self.seed_bin_regressor = SeedBinRegressorLayer( - btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth) - self.seed_projector = Projector(btlnck_features, bin_embedding_dim) - self.projectors = nn.ModuleList([ - Projector(num_out, bin_embedding_dim) - for num_out in num_out_features - ]) - # 1000, 2, inv, mean - self.attractors = nn.ModuleList([ - Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth, - alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type) - for i in range(len(num_out_features)) - ]) - - last_in = N_MIDAS_OUT + 1 # +1 for relative depth - - # use log binomial instead of softmax - self.conditional_log_binomial = ConditionalLogBinomial( - last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp) - - self.handles = [] - self.hook_feats = {} - self.set_fetch_features(fetch_features) - - # settings for patchfusion - self.use_area_prior = use_area_prior - self.g2l = g2l - - self.fusion_conv_list = nn.ModuleList() - for i in range(6): - if i == 5: - layer = nn.Conv2d(N_MIDAS_OUT * 2, N_MIDAS_OUT, 3, 1, 1) - else: - layer = nn.Conv2d(btlnck_features * 2, btlnck_features, 3, 1, 1) - self.fusion_conv_list.append(layer) - - self.coarse_input_proj = nn.ModuleList() - for i in range(6): - if i == 4: - layer = nn.Conv2d(N_MIDAS_OUT, N_MIDAS_OUT, 3, 1, 1) - else: - layer = nn.Conv2d(btlnck_features, btlnck_features, 3, 1, 1) - self.coarse_input_proj.append(layer) - - self.fine_input_proj = nn.ModuleList() - for i in range(6): - if i == 4: - layer = nn.Conv2d(N_MIDAS_OUT, N_MIDAS_OUT, 3, 1, 1) - else: - layer = nn.Conv2d(btlnck_features, btlnck_features, 3, 1, 1) - self.fine_input_proj.append(layer) - - self.coarse_depth_proj = nn.Conv2d(1, last_in, 3, 1, 1) - self.fine_depth_proj = nn.Conv2d(1, last_in, 3, 1, 1) - # self.coarse_depth_proj = nn.Conv2d(1, 1, 3, 1, 1) - # self.fine_depth_proj = nn.Conv2d(1, 1, 3, 1, 1) - # self.init_weight() - - self.freeze = freeze - if self.freeze: - # Freeze the parameters of sub_model - for param in self.fine_model.parameters(): - param.requires_grad = False - - for param in self.coarse_model.parameters(): - param.requires_grad = False - - # Set the sub_model to evaluation mode - self.fine_model.eval() - self.coarse_model.eval() 
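# A minimal, self-contained sketch of the freeze-and-eval pattern used just above, with
# hypothetical names (freeze_module, FrozenPair); PatchFusion itself instead re-asserts
# eval() on the frozen sub-models inside forward() whenever freeze is set.
#
#   import torch.nn as nn
#
#   def freeze_module(m: nn.Module) -> nn.Module:
#       """Disable gradients and pin the module to eval so norm/dropout stats stay fixed."""
#       for p in m.parameters():
#           p.requires_grad = False
#       return m.eval()
#
#   class FrozenPair(nn.Module):
#       """Holds a coarse and a fine sub-model that must never leave eval mode."""
#       def __init__(self, coarse: nn.Module, fine: nn.Module):
#           super().__init__()
#           self.coarse = freeze_module(coarse)
#           self.fine = freeze_module(fine)
#
#       def train(self, mode: bool = True):
#           super().train(mode)      # the parent may be switched to train mode...
#           self.coarse.eval()       # ...but the frozen children stay pinned to eval
#           self.fine.eval()
#           return self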
- - self.use_fusion_network = use_fusion_network - if self.use_fusion_network: - self.fusion_extractor = UNetv1(5, self.g2l, pos_embed, use_area_prior) - - self.consistency_training = consistency_training - if self.consistency_training: - if consistency_target == 'mix': - consistency_target = 'unet_feat' - self.consistency_target = consistency_target - print("current consistency target is {}".format(consistency_target)) - - self.consistency_projs = nn.ModuleList() - if self.consistency_target == 'unet_feat': - for i in range(6): - if i == 5: - layer = nn.Conv2d(N_MIDAS_OUT, N_MIDAS_OUT, 1, 1, 0) - layer = nn.Identity() - else: - layer = nn.Conv2d(btlnck_features, btlnck_features, 1, 1, 0) - layer = nn.Identity() - self.consistency_projs.append(layer) - - if self.consistency_target == 'final_feat': - layer = nn.Conv2d(64, 64, 1, 1, 0) # 192 - layer = nn.Identity() - self.consistency_projs.append(layer) - layer = nn.Conv2d(32, 32, 1, 1, 0) # 384 - layer = nn.Identity() - self.consistency_projs.append(layer) - layer = nn.Conv2d(128, 128, 1, 1, 0) # 192 - layer = nn.Identity() - self.consistency_projs.append(layer) - - def init_weight(self): - for m in self.coarse_input_proj: - torch.nn.init.constant_(m.weight, 0) - torch.nn.init.constant_(m.bias, 0) - - for m in self.fine_input_proj: - torch.nn.init.constant_(m.weight, 0) - torch.nn.init.constant_(m.bias, 0) - - torch.nn.init.constant_(self.coarse_depth_proj.weight, 0) - torch.nn.init.constant_(self.fine_depth_proj.weight, 0) - - def get_lr_params(self, lr): - """ - Learning rate configuration for different layers of the model - Args: - lr (float) : Base learning rate - Returns: - list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. - """ - - # return self.fusion_extractor.parameters() - - param_conf = [] - param_conf_coarse_model = self.coarse_model.get_lr_params(lr) - param_conf_fine_model = self.fine_model.get_lr_params(lr) - param_conf.extend(param_conf_coarse_model) - param_conf.extend(param_conf_fine_model) - - - skip_list = {'absolute_pos_embed'} - skip_keywords = {'relative_position_bias_table'} - skip_hack = {'g2l0', 'g2l1', 'g2l2', 'g2l3', 'g2l4', 'g2l5'} - - no_decay = [] - has_decay = [] - fusion_enc = [] - - for name, param in self.named_parameters(): - if 'coarse_model' not in name and 'fine_model' not in name: - if len(param.shape) == 1 or (name.endswith(".bias") and check_keywords_in_name(name, skip_hack)) or check_keywords_in_name(name, skip_list) or \ - check_keywords_in_name(name, skip_keywords): - # print("no decay: {}".format(name)) - no_decay.append(param) - else: - # print("has decay: {}".format(name)) - has_decay.append(param) - - param_conf.append({'params': has_decay, 'lr': lr}) - param_conf.append({'params': no_decay, 'weight_decay': 0., 'lr': lr}) - # param_conf.append({'params': fusion_enc, 'lr': lr / self.encoder_lr_factor}) - param_conf.append({'params': fusion_enc, 'lr': lr}) - - return param_conf - - def forward( - self, - x, - sampled_depth=None, - mode='train', - return_final_centers=False, - denorm=False, - return_probs=False, - image_raw=None, - bbox=None, - crop_area=None, - shift=None, - bbox_raw=None, - iter_prior=None, - previous_info=None, - **kwargs): - """ - Args: - x (torch.Tensor): Input image tensor of shape (B, C, H, W) - return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False. - denorm (bool, optional): Whether to denormalize the input image. 
This reverses ImageNet normalization as midas normalization is different. Defaults to False. - return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False. - - Returns: - dict: Dictionary containing the following keys: - - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W) - - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W) - - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins). Present only if return_final_centers is True - - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True - - """ - - if self.consistency_training and mode == 'train': - split_x = torch.split(x, 3, dim=1) - x = torch.cat(split_x, dim=0) - image_raw = torch.cat([image_raw, image_raw], dim=0) - split_bbox = torch.split(bbox, 4, dim=-1) - bbox = torch.cat(split_bbox, dim=0) - split_bbox = torch.split(bbox_raw, 4, dim=-1) - bbox_raw = torch.cat(split_bbox, dim=0) - crop_area = torch.split(crop_area, 1, dim=1) - crop_area = torch.cat(crop_area, dim=0) - - crop_input = x - - # coarse forward - if self.freeze: - with torch.no_grad(): - if self.fine_model.training: - self.fine_model.eval() - self.coarse_model.eval() - - if previous_info is None: - previous_info = dict() - whole_depth_pred = self.coarse_model(image_raw)['metric_depth'] - previous_info['whole_depth_pred'] = whole_depth_pred - previous_info['coarse_model.hook_feats'] = self.coarse_model.hook_feats - - else: - whole_depth_pred = previous_info['whole_depth_pred'] - self.coarse_model.hook_feats = dict() - self.coarse_model.hook_feats = previous_info['coarse_model.hook_feats'] - - fine_depth_pred = self.fine_model(x)['metric_depth'] - - whole_depth_pred = nn.functional.interpolate( - whole_depth_pred, (2160, 3840), mode='bilinear', align_corners=True) - - else: - whole_depth_pred = self.coarse_model(image_raw)['metric_depth'] - fine_depth_pred = self.fine_model(x)['metric_depth'] - - coarse_model_midas_enc_feats = [ - self.coarse_input_proj[5](self.coarse_model.hook_feats['x_d0']), - self.coarse_input_proj[0](self.coarse_model.hook_feats['x_blocks_feat_0']), - self.coarse_input_proj[1](self.coarse_model.hook_feats['x_blocks_feat_1']), - self.coarse_input_proj[2](self.coarse_model.hook_feats['x_blocks_feat_2']), - self.coarse_input_proj[3](self.coarse_model.hook_feats['x_blocks_feat_3']), - self.coarse_input_proj[4](self.coarse_model.hook_feats['midas_final_feat'])] # 384 - - if self.g2l: - coarse_model_midas_enc_feats_g2l = coarse_model_midas_enc_feats - - if self.use_area_prior: - crop_area_resize = [ - nn.functional.interpolate(crop_area, (12, 16), mode='bilinear', align_corners=True), - nn.functional.interpolate(crop_area, (24, 32), mode='bilinear', align_corners=True), - nn.functional.interpolate(crop_area, (48, 64), mode='bilinear', align_corners=True), - nn.functional.interpolate(crop_area, (96, 128), mode='bilinear', align_corners=True), - nn.functional.interpolate(crop_area, (192, 256), mode='bilinear', align_corners=True), - nn.functional.interpolate(crop_area, (384, 512), mode='bilinear', align_corners=True)] - else: - crop_area_resize = None - - inds = torch.arange(bbox.shape[0]).to(bbox.device).unsqueeze(dim=-1) - bbox = torch.cat((inds, bbox), dim=-1) - coarse_model_midas_enc_roi_feats = [ - torch_roi_align(coarse_model_midas_enc_feats[0], bbox, (12, 16), 12/384, aligned=True), - torch_roi_align(coarse_model_midas_enc_feats[1], bbox, (24, 32), 24/384, aligned=True), - 
torch_roi_align(coarse_model_midas_enc_feats[2], bbox, (48, 64), 48/384, aligned=True), - torch_roi_align(coarse_model_midas_enc_feats[3], bbox, (96, 128), 96/384, aligned=True), - torch_roi_align(coarse_model_midas_enc_feats[4], bbox, (192, 256), 192/384, aligned=True), - torch_roi_align(coarse_model_midas_enc_feats[5], bbox, (384, 512), 384/384, aligned=True) - ] - - # whole_depth_roi_pred = torch_roi_align(whole_depth_pred, bbox, (384, 512), 384/384) - # back to full resolution to avoid potential misalignment - bbox_hack = copy.deepcopy(bbox) - bbox_hack[:, 1] = bbox[:, 1] / 512 * 3840 # scale back to full resolution coord - bbox_hack[:, 2] = bbox[:, 2] / 384 * 2160 - bbox_hack[:, 3] = bbox[:, 3] / 512 * 3840 - bbox_hack[:, 4] = bbox[:, 4] / 384 * 2160 - whole_depth_roi_pred = torch_roi_align(whole_depth_pred, bbox_hack, (384, 512), 1, aligned=True) - - fine_model_midas_enc_feats = [ - self.fine_input_proj[5](self.fine_model.hook_feats['x_d0']), - self.fine_input_proj[0](self.fine_model.hook_feats['x_blocks_feat_0']), - self.fine_input_proj[1](self.fine_model.hook_feats['x_blocks_feat_1']), - self.fine_input_proj[2](self.fine_model.hook_feats['x_blocks_feat_2']), - self.fine_input_proj[3](self.fine_model.hook_feats['x_blocks_feat_3']), - self.fine_input_proj[4](self.fine_model.hook_feats['midas_final_feat'])] # 384 - - x_plane = [] - x_blocks = [] - feat_plus_list = [] - feat_cat_list = [] - res_pool = [(24, 32), (48, 64), (96, 128), (192, 256), (384, 512)] - for l_i, (f_ca, f_c_roi, f_f) in enumerate(zip(coarse_model_midas_enc_feats, coarse_model_midas_enc_roi_feats, fine_model_midas_enc_feats)): - feat_cat = self.fusion_conv_list[l_i](torch.cat([f_c_roi, f_f], dim=1)) - feat_plus = f_c_roi + f_f - feat_cat_list.append(feat_cat) - feat_plus_list.append(feat_plus) - - if iter_prior is not None: - input_tensor = torch.cat([whole_depth_roi_pred, iter_prior, crop_input], dim=1) - else: - input_tensor = torch.cat([whole_depth_roi_pred, fine_depth_pred, crop_input], dim=1) - output = self.fusion_extractor( - input_tensor = input_tensor, - guide_plus = feat_plus_list, - guide_cat = feat_cat_list, - bbox = bbox, - crop_area_resize = crop_area_resize, - fine_feat_crop = fine_model_midas_enc_feats, - coarse_feat_whole = coarse_model_midas_enc_feats, - coarse_feat_crop = coarse_model_midas_enc_roi_feats, - coarse_feat_whole_hack=None)[::-1] # low -> high - - x_blocks = output - x = x_blocks[0] - x_blocks = x_blocks[1:] - - if self.consistency_training: - if self.consistency_target == 'unet_feat': - proj_feat_list = [] - for idx, feat in enumerate(output): - proj_feat = self.consistency_projs[idx](feat) - proj_feat_list.append(proj_feat) - - # NOTE: below is ZoeDepth implementation - # # new last - # last = coarse_model_midas_enc_roi_feats[-1] + fine_model_midas_enc_feats[-1] - last = x_blocks[-1] # have already been fused in x_blocks - self.hook_feats['midas_final_feat'] = last - - bs, c, h, w = last.shape - rel_cond = torch.zeros((bs, 1, h, w), device=last.device) - - self.hook_feats['rel_depth'] = rel_cond # skip this - - _, seed_b_centers = self.seed_bin_regressor(x) - - if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': - b_prev = (seed_b_centers - self.min_depth) / \ - (self.max_depth - self.min_depth) - else: - b_prev = seed_b_centers - - prev_b_embedding = self.seed_projector(x) - - # unroll this loop for better performance - for idx, (projector, attractor, x) in enumerate(zip(self.projectors, self.attractors, x_blocks)): - b_embedding = projector(x) - 
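        # ZoeDepth-style bin refinement: the embedding computed just above conditions the
        # attractor layer, which pulls the previous bin centers b_prev toward refined
        # positions; the refined centers and embedding are then carried to the next decoder level.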
self.hook_feats['x_blocks_feat_{}'.format(idx)] = x - b, b_centers = attractor( - b_embedding, b_prev, prev_b_embedding, interpolate=True) - b_prev = b.clone() - prev_b_embedding = b_embedding.clone() - - self.hook_feats['b_centers'] = b_centers - - - if self.consistency_training: - if self.consistency_target == 'final_feat': - proj_feat_1 = self.consistency_projs[0](b_centers) - proj_feat_2 = self.consistency_projs[1](last) - proj_feat_3 = self.consistency_projs[2](b_embedding) - proj_feat_list = [proj_feat_1, proj_feat_2, proj_feat_3] - - rel_cond = nn.functional.interpolate( - rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) - last = torch.cat([last, rel_cond], dim=1) # + self.coarse_depth_proj(whole_depth_roi_pred) + self.fine_depth_proj(fine_depth_pred) - b_embedding = nn.functional.interpolate( - b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) - # till here, we have features (attached with a relative depth prediction) and embeddings - # post process - # final_pred = out * self.blur_mask + whole_depth_roi_pred * (1-self.blur_mask) - # out = F.interpolate(out, (540, 960), mode='bilinear', align_corners=True) - x = self.conditional_log_binomial(last, b_embedding) - b_centers = nn.functional.interpolate( - b_centers, x.shape[-2:], mode='bilinear', align_corners=True) - - out = torch.sum(x * b_centers, dim=1, keepdim=True) - - - final_pred = out - output = dict(metric_depth=final_pred) - - output['coarse_depth_pred'] = whole_depth_pred - output['fine_depth_pred'] = fine_depth_pred - output['coarse_depth_pred_roi'] = whole_depth_roi_pred - if self.consistency_training: - if self.consistency_target == 'final_feat' or self.consistency_target == 'unet_feat': - output['temp_features'] = proj_feat_list - - output['previous_info'] = previous_info - - return output - - @staticmethod - def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, coarse_model_path=None, fine_model_path=None, **kwargs): - from zoedepth.models.zoedepth_custom.zoedepth_custom import ZoeDepthCustom - - print("build pretrained condition model from {}".format(coarse_model_path)) - coarse_model = ZoeDepthCustom.build( - midas_model_type=midas_model_type, - pretrained_resource=coarse_model_path, - # pretrained_resource="", - use_pretrained_midas=use_pretrained_midas, - # use_pretrained_midas=False, - train_midas=train_midas, - freeze_midas_bn=freeze_midas_bn, - **kwargs) - - print("build pretrained condition model from {}".format(fine_model_path)) - fine_model = ZoeDepthCustom.build( - midas_model_type=midas_model_type, - pretrained_resource=fine_model_path, - # pretrained_resource="", - use_pretrained_midas=use_pretrained_midas, - # use_pretrained_midas=False, - train_midas=train_midas, - freeze_midas_bn=freeze_midas_bn, - **kwargs) - - model = PatchFusion(coarse_model, fine_model, **kwargs) - if pretrained_resource: - assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" - model = load_state_from_resource(model, pretrained_resource) - - return model - - @staticmethod - def build_from_config(config): - return PatchFusion.build(**config) - - def remove_hooks(self): - for h in self.handles: - h.remove() - return self - - def set_fetch_features(self, fetch_features): - self.fetch_features = fetch_features - if fetch_features: - if len(self.handles) == 0: - self.attach_hooks() - else: - self.remove_hooks() - return self - - def attach_hooks(self): - - 
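        # Register forward hooks on the seed projector, the per-level projectors and the
        # attractor layers so their outputs are cached in self.hook_feats during every
        # forward pass; remove_hooks() detaches them again via the stored handles.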
self.handles.append(self.seed_projector.register_forward_hook(get_activation("seed_projector", self.hook_feats))) - - for idx, proj in enumerate(self.projectors): - self.handles.append(proj.register_forward_hook(get_activation("projector_{}".format(idx), self.hook_feats))) - - for idx, proj in enumerate(self.attractors): - self.handles.append(proj.register_forward_hook(get_activation("attractor_{}".format(idx), self.hook_feats))) - - return self diff --git a/zoedepth/models/zoedepth_custom/zoedepth_custom.py b/zoedepth/models/zoedepth_custom/zoedepth_custom.py deleted file mode 100644 index e810cdd09842133cd6a311768df328215e7309d1..0000000000000000000000000000000000000000 --- a/zoedepth/models/zoedepth_custom/zoedepth_custom.py +++ /dev/null @@ -1,335 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Shariq Farooq Bhat - -import itertools - -import math -import torch -import torch.nn as nn -import numpy as np - -from zoedepth.models.depth_model import DepthModel -from zoedepth.models.base_models.midas import MidasCore -from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed -from zoedepth.models.layers.dist_layers import ConditionalLogBinomial, ConditionalLogBinomialV2 -from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, - SeedBinRegressorUnnormed) -from zoedepth.models.model_io import load_state_from_resource -from torchvision.transforms import Normalize - -def get_activation(name, bank): - # input of forward_hook will be a function of model/inp/oup - def hook(module, input, output): - bank[name] = output - return hook - -class ZoeDepthCustom(DepthModel): - def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10, - n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, - midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, sr_ratio=1, - raw_depth_shape=(2160, 3840), transform_sample_gt_size=(2160, 3840), representation='', - fetch_features=True, sample_feat_level=3, use_hr=False, deform=False, wo_bins=False, baseline=False, **kwargs): - """ZoeDepth model. 
This is the version of ZoeDepth that has a single metric head - - Args: - core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features - n_bins (int, optional): Number of bin centers. Defaults to 64. - bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. - For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus". - bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. - min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3. - max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10. - n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. - attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. - attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. - attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. - attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. - min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. - max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. - train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. - midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. - encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. - pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. - - sr_ratio: sr ratio during infer - raw_depth_shape: raw depth shape during infer. times sr_ratio will be the target resolution. Used to sample points during training - transform_sample_gt_size: training depth shape # influenced by crop shape which is not included in this pipeline right now - representation: I use it to test the "bilap head" and a discarded idea - fetch_features: if fetch feats. 
Default=True - """ - super().__init__() - - self.core = core - self.max_depth = max_depth - self.min_depth = min_depth - self.min_temp = min_temp - self.bin_centers_type = bin_centers_type - - self.midas_lr_factor = midas_lr_factor - self.encoder_lr_factor = encoder_lr_factor - self.pos_enc_lr_factor = pos_enc_lr_factor - self.train_midas = train_midas - self.inverse_midas = inverse_midas - - if self.encoder_lr_factor <= 0: - self.core.freeze_encoder( - freeze_rel_pos=self.pos_enc_lr_factor <= 0) - - N_MIDAS_OUT = 32 - btlnck_features = self.core.output_channels[0] - num_out_features = self.core.output_channels[1:] - - self.conv2 = nn.Conv2d(btlnck_features, btlnck_features, - kernel_size=1, stride=1, padding=0) # btlnck conv - - if bin_centers_type == "normed": - SeedBinRegressorLayer = SeedBinRegressor - Attractor = AttractorLayer - elif bin_centers_type == "softplus": # default - SeedBinRegressorLayer = SeedBinRegressorUnnormed - Attractor = AttractorLayerUnnormed - elif bin_centers_type == "hybrid1": - SeedBinRegressorLayer = SeedBinRegressor - Attractor = AttractorLayerUnnormed - elif bin_centers_type == "hybrid2": - SeedBinRegressorLayer = SeedBinRegressorUnnormed - Attractor = AttractorLayer - else: - raise ValueError( - "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") - - self.seed_bin_regressor = SeedBinRegressorLayer( - btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth) - self.seed_projector = Projector(btlnck_features, bin_embedding_dim) - self.projectors = nn.ModuleList([ - Projector(num_out, bin_embedding_dim) - for num_out in num_out_features - ]) - # 1000, 2, inv, mean - self.attractors = nn.ModuleList([ - Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth, - alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type) - for i in range(len(num_out_features)) - ]) - - last_in = N_MIDAS_OUT + 1 # +1 for relative depth - - # use log binomial instead of softmax - self.conditional_log_binomial = ConditionalLogBinomial( - last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp) - - self.handles = [] - self.hook_feats = {} - self.set_fetch_features(fetch_features) - - self.baseline = baseline - - def init_weight(self): - if self.deform: - for m in self.mlp_feat_offset.modules(): - if isinstance(m, nn.Conv1d): - torch.nn.init.constant_(m.weight, 0) - - if self.representation == 'biLaplacian': - for m in self.proj_x_block: - torch.nn.init.constant_(m.weight, 0) - - def forward(self, x, sampled_depth=None, mode='train', return_final_centers=False, denorm=False, return_probs=False, image_raw=None, **kwargs): - """ - Args: - x (torch.Tensor): Input image tensor of shape (B, C, H, W) - return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False. - denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False. - return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False. - - Returns: - dict: Dictionary containing the following keys: - - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W) - - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W) - - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins). 
Present only if return_final_centers is True - - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True - - """ - - b, c, h, w = x.shape - # print("input shape ", x.shape) - self.orig_input_width = w - self.orig_input_height = h - rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) - # print("output shapes", rel_depth.shape, out.shape) - - outconv_activation = out[0] - btlnck = out[1] - x_blocks = out[2:] - - x_d0 = self.conv2(btlnck) - self.hook_feats['x_d0'] = x_d0 - x = x_d0 - - last = outconv_activation - self.hook_feats['midas_final_feat'] = last - - if self.inverse_midas: - # invert depth followed by normalization - rel_depth = 1.0 / (rel_depth + 1e-6) - rel_depth = (rel_depth - rel_depth.min()) / \ - (rel_depth.max() - rel_depth.min()) - # concat rel depth with last. First interpolate rel depth to last size - - rel_cond = rel_depth.unsqueeze(1) - self.hook_feats['rel_depth'] = rel_cond - - _, seed_b_centers = self.seed_bin_regressor(x) - - if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': - b_prev = (seed_b_centers - self.min_depth) / \ - (self.max_depth - self.min_depth) - else: - b_prev = seed_b_centers - - prev_b_embedding = self.seed_projector(x) - - # unroll this loop for better performance - - for idx, (projector, attractor, x) in enumerate(zip(self.projectors, self.attractors, x_blocks)): - b_embedding = projector(x) - self.hook_feats['x_blocks_feat_{}'.format(idx)] = x - # self.hook_feats['attractor_{}_0'.format(idx)] = b_embedding - # self.hook_feats['attractor_{}_1'.format(idx)] = b_prev - # self.hook_feats['attractor_{}_2'.format(idx)] = prev_b_embedding - b, b_centers = attractor( - b_embedding, b_prev, prev_b_embedding, interpolate=True) - b_prev = b.clone() - prev_b_embedding = b_embedding.clone() - - self.hook_feats['b_centers'] = b_centers - - if self.baseline: - rel_cond = nn.functional.interpolate( - rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) - last = torch.cat([last, rel_cond], dim=1) - b_embedding = nn.functional.interpolate( - b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) - # till here, we have features (attached with a relative depth prediction) and embeddings - x = self.conditional_log_binomial(last, b_embedding) - b_centers = nn.functional.interpolate( - b_centers, x.shape[-2:], mode='bilinear', align_corners=True) - self.hook_feats['b_centers'] = b_centers - out = torch.sum(x * b_centers, dim=1, keepdim=True) - output = dict(metric_depth=out) - return output - - # if mode == 'train': - # if self.wo_bins and self.representation == 'biLaplacian': - # pred_depth_dict = self.implicit_function_train_process(sampled_depth, mode=mode) - # output = pred_depth_dict - # else: - # pred_depth = self.implicit_function_train_process(sampled_depth, mode=mode) - # output = dict(metric_depth=pred_depth) - - # elif mode == 'eval': - # pred_depth = self.implicit_function_infer_process(mode=mode) - # output = dict(metric_depth=pred_depth) - # else: - # raise NotImplementedError - - return output - - # change param lr later - def get_lr_params(self, lr): - """ - Learning rate configuration for different layers of the model - Args: - lr (float) : Base learning rate - Returns: - list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. 
- """ - param_conf = [] - if self.train_midas: - if self.encoder_lr_factor > 0: - param_conf.append({'params': self.core.get_enc_params_except_rel_pos( - ), 'lr': lr / self.encoder_lr_factor}) - - if self.pos_enc_lr_factor > 0: - param_conf.append( - {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) - - midas_params = self.core.core.scratch.parameters() - midas_lr_factor = self.midas_lr_factor - param_conf.append( - {'params': midas_params, 'lr': lr / midas_lr_factor}) - - remaining_modules = [] - for name, child in self.named_children(): - if name != 'core': - remaining_modules.append(child) - remaining_params = itertools.chain( - *[child.parameters() for child in remaining_modules]) - - param_conf.append({'params': remaining_params, 'lr': lr}) - - return param_conf - - @staticmethod - def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): - core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, - train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) - model = ZoeDepthCustom(core, **kwargs) - if pretrained_resource: - assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" - model = load_state_from_resource(model, pretrained_resource) - # a = torch.ones((2, 3, 100, 200)) - # b = torch.ones((2, 500, 3)) - # model(a, b) - return model - - @staticmethod - def build_from_config(config): - return ZoeDepthCustom.build(**config) - - def remove_hooks(self): - for h in self.handles: - h.remove() - return self - - def set_fetch_features(self, fetch_features): - self.fetch_features = fetch_features - if fetch_features: - if len(self.handles) == 0: - self.attach_hooks() - else: - self.remove_hooks() - return self - - def attach_hooks(self): - - self.handles.append(self.seed_projector.register_forward_hook(get_activation("seed_projector", self.hook_feats))) - - for idx, proj in enumerate(self.projectors): - self.handles.append(proj.register_forward_hook(get_activation("projector_{}".format(idx), self.hook_feats))) - - for idx, proj in enumerate(self.attractors): - self.handles.append(proj.register_forward_hook(get_activation("attractor_{}".format(idx), self.hook_feats))) - - return self diff --git a/zoedepth/models/zoedepth_custom/__init__.py b/zoedepth/models/zoedepth_nk/__init__.py similarity index 87% rename from zoedepth/models/zoedepth_custom/__init__.py rename to zoedepth/models/zoedepth_nk/__init__.py index 2de3a6af3d3834d7895f2eb1d4c6f17ba77ef9f5..513a278b939c10c010e3c0250ec73544d5663886 100644 --- a/zoedepth/models/zoedepth_custom/__init__.py +++ b/zoedepth/models/zoedepth_nk/__init__.py @@ -20,14 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-# File author: Zhenyu Li +# File author: Shariq Farooq Bhat -from .zoedepth_custom import ZoeDepthCustom -from .patchfusion import PatchFusion +from .zoedepth_nk_v1 import ZoeDepthNK all_versions = { - "custom": ZoeDepthCustom, - "patchfusion": PatchFusion + "v1": ZoeDepthNK, } get_version = lambda v : all_versions[v] \ No newline at end of file diff --git a/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json b/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json new file mode 100644 index 0000000000000000000000000000000000000000..42bab2a3ad159a09599a5aba270c491021a3cf1a --- /dev/null +++ b/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json @@ -0,0 +1,67 @@ +{ + "model": { + "name": "ZoeDepthNK", + "version_name": "v1", + "bin_conf" : [ + { + "name": "nyu", + "n_bins": 64, + "min_depth": 1e-3, + "max_depth": 10.0 + }, + { + "name": "kitti", + "n_bins": 64, + "min_depth": 1e-3, + "max_depth": 80.0 + } + ], + "bin_embedding_dim": 128, + "bin_centers_type": "softplus", + "n_attractors":[16, 8, 4, 1], + "attractor_alpha": 1000, + "attractor_gamma": 2, + "attractor_kind" : "mean", + "attractor_type" : "inv", + "min_temp": 0.0212, + "max_temp": 50.0, + "memory_efficient": true, + "midas_model_type" : "DPT_BEiT_L_384", + "img_size": [384, 512] + }, + + "train": { + "train_midas": true, + "use_pretrained_midas": true, + "trainer": "zoedepth_nk", + "epochs": 5, + "bs": 16, + "optim_kwargs": {"lr": 0.0002512, "wd": 0.01}, + "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, + "same_lr": false, + "w_si": 1, + "w_domain": 100, + "avoid_boundary": false, + "random_crop": false, + "input_width": 640, + "input_height": 480, + "w_grad": 0, + "w_reg": 0, + "midas_lr_factor": 10, + "encoder_lr_factor":10, + "pos_enc_lr_factor":10 + }, + + "infer": { + "train_midas": false, + "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", + "use_pretrained_midas": false, + "force_keep_ar": true + }, + + "eval": { + "train_midas": false, + "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", + "use_pretrained_midas": false + } +} \ No newline at end of file diff --git a/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py b/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..7368ae8031188a9f946d9d3f29633c96e791e68e --- /dev/null +++ b/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py @@ -0,0 +1,333 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import itertools + +import torch +import torch.nn as nn + +from zoedepth.models.depth_model import DepthModel +from zoedepth.models.base_models.midas import MidasCore +from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed +from zoedepth.models.layers.dist_layers import ConditionalLogBinomial +from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, + SeedBinRegressorUnnormed) +from zoedepth.models.layers.patch_transformer import PatchTransformerEncoder +from zoedepth.models.model_io import load_state_from_resource + + +class ZoeDepthNK(DepthModel): + def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128, + n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', + min_temp=5, max_temp=50, + memory_efficient=False, train_midas=True, + is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): + """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts. + + Args: + core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features + + bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys: + "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float) + + The length of this list determines the number of metric heads. + bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. + For "softplus", softplus activation is used and thus are unbounded. Defaults to "normed". + bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. + + n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. + attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. + attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. + attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. + attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. + + min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. + max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. + + memory_efficient (bool, optional): Whether to use memory efficient version of attractor layers. Memory efficient version is slower but is recommended incase of multiple metric heads in order save GPU memory. Defaults to False. + + train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. + is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True. 
+ midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. + encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. + pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. + + """ + + super().__init__() + + self.core = core + self.bin_conf = bin_conf + self.min_temp = min_temp + self.max_temp = max_temp + self.memory_efficient = memory_efficient + self.train_midas = train_midas + self.is_midas_pretrained = is_midas_pretrained + self.midas_lr_factor = midas_lr_factor + self.encoder_lr_factor = encoder_lr_factor + self.pos_enc_lr_factor = pos_enc_lr_factor + self.inverse_midas = inverse_midas + + N_MIDAS_OUT = 32 + btlnck_features = self.core.output_channels[0] + num_out_features = self.core.output_channels[1:] + # self.scales = [16, 8, 4, 2] # spatial scale factors + + self.conv2 = nn.Conv2d( + btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0) + + # Transformer classifier on the bottleneck + self.patch_transformer = PatchTransformerEncoder( + btlnck_features, 1, 128, use_class_token=True) + self.mlp_classifier = nn.Sequential( + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 2) + ) + + if bin_centers_type == "normed": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayer + elif bin_centers_type == "softplus": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid1": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid2": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayer + else: + raise ValueError( + "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") + self.bin_centers_type = bin_centers_type + # We have bins for each bin conf. + # Create a map (ModuleDict) of 'name' -> seed_bin_regressor + self.seed_bin_regressors = nn.ModuleDict( + {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"]) + for conf in bin_conf} + ) + + self.seed_projector = Projector( + btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) + self.projectors = nn.ModuleList([ + Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) + for num_out in num_out_features + ]) + + # Create a map (ModuleDict) of 'name' -> attractors (ModuleList) + self.attractors = nn.ModuleDict( + {conf['name']: nn.ModuleList([ + Attractor(bin_embedding_dim, n_attractors[i], + mlp_dim=bin_embedding_dim, alpha=attractor_alpha, + gamma=attractor_gamma, kind=attractor_kind, + attractor_type=attractor_type, memory_efficient=memory_efficient, + min_depth=conf["min_depth"], max_depth=conf["max_depth"]) + for i in range(len(n_attractors)) + ]) + for conf in bin_conf} + ) + + last_in = N_MIDAS_OUT + # conditional log binomial for each bin conf + self.conditional_log_binomial = nn.ModuleDict( + {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp) + for conf in bin_conf} + ) + + def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): + """ + Args: + x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain. 
+ return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False. + denorm (bool, optional): Whether to denormalize the input image. Defaults to False. + return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False. + + Returns: + dict: Dictionary of outputs with keys: + - "rel_depth": Relative depth map of shape (B, 1, H, W) + - "metric_depth": Metric depth map of shape (B, 1, H, W) + - "domain_logits": Domain logits of shape (B, 2) + - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True + - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True + """ + b, c, h, w = x.shape + self.orig_input_width = w + self.orig_input_height = h + rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) + + outconv_activation = out[0] + btlnck = out[1] + x_blocks = out[2:] + + x_d0 = self.conv2(btlnck) + x = x_d0 + + # Predict which path to take + embedding = self.patch_transformer(x)[0] # N, E + domain_logits = self.mlp_classifier(embedding) # N, 2 + domain_vote = torch.softmax(domain_logits.sum( + dim=0, keepdim=True), dim=-1) # 1, 2 + + # Get the path + bin_conf_name = ["nyu", "kitti"][torch.argmax( + domain_vote, dim=-1).squeeze().item()] + + try: + conf = [c for c in self.bin_conf if c.name == bin_conf_name][0] + except IndexError: + raise ValueError( + f"bin_conf_name {bin_conf_name} not found in bin_confs") + + min_depth = conf['min_depth'] + max_depth = conf['max_depth'] + + seed_bin_regressor = self.seed_bin_regressors[bin_conf_name] + _, seed_b_centers = seed_bin_regressor(x) + if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': + b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth) + else: + b_prev = seed_b_centers + prev_b_embedding = self.seed_projector(x) + + attractors = self.attractors[bin_conf_name] + for projector, attractor, x in zip(self.projectors, attractors, x_blocks): + b_embedding = projector(x) + b, b_centers = attractor( + b_embedding, b_prev, prev_b_embedding, interpolate=True) + b_prev = b + prev_b_embedding = b_embedding + + last = outconv_activation + + b_centers = nn.functional.interpolate( + b_centers, last.shape[-2:], mode='bilinear', align_corners=True) + b_embedding = nn.functional.interpolate( + b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) + + clb = self.conditional_log_binomial[bin_conf_name] + x = clb(last, b_embedding) + + # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor + # print(x.shape, b_centers.shape) + # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True) + out = torch.sum(x * b_centers, dim=1, keepdim=True) + + output = dict(domain_logits=domain_logits, metric_depth=out) + if return_final_centers or return_probs: + output['bin_centers'] = b_centers + + if return_probs: + output['probs'] = x + return output + + def get_lr_params(self, lr): + """ + Learning rate configuration for different layers of the model + + Args: + lr (float) : Base learning rate + Returns: + list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. 
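            Example of the returned structure (illustrative only, mirroring the code below):
                [{'params': encoder_params,   'lr': lr / encoder_lr_factor},
                 {'params': rel_pos_params,   'lr': lr / pos_enc_lr_factor},
                 {'params': midas_params,     'lr': lr / midas_lr_factor},
                 {'params': remaining_params, 'lr': lr}]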
+ """ + param_conf = [] + if self.train_midas: + def get_rel_pos_params(): + for name, p in self.core.core.pretrained.named_parameters(): + if "relative_position" in name: + yield p + + def get_enc_params_except_rel_pos(): + for name, p in self.core.core.pretrained.named_parameters(): + if "relative_position" not in name: + yield p + + encoder_params = get_enc_params_except_rel_pos() + rel_pos_params = get_rel_pos_params() + midas_params = self.core.core.scratch.parameters() + midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0 + param_conf.extend([ + {'params': encoder_params, 'lr': lr / self.encoder_lr_factor}, + {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor}, + {'params': midas_params, 'lr': lr / midas_lr_factor} + ]) + + remaining_modules = [] + for name, child in self.named_children(): + if name != 'core': + remaining_modules.append(child) + remaining_params = itertools.chain( + *[child.parameters() for child in remaining_modules]) + param_conf.append({'params': remaining_params, 'lr': lr}) + return param_conf + + def get_conf_parameters(self, conf_name): + """ + Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + params = [] + for name, child in self.named_children(): + if isinstance(child, nn.ModuleDict): + for bin_conf_name, module in child.items(): + if bin_conf_name == conf_name: + params += list(module.parameters()) + return params + + def freeze_conf(self, conf_name): + """ + Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + for p in self.get_conf_parameters(conf_name): + p.requires_grad = False + + def unfreeze_conf(self, conf_name): + """ + Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + for p in self.get_conf_parameters(conf_name): + p.requires_grad = True + + def freeze_all_confs(self): + """ + Freezes all the parameters of all the ModuleDicts children + """ + for name, child in self.named_children(): + if isinstance(child, nn.ModuleDict): + for bin_conf_name, module in child.items(): + for p in module.parameters(): + p.requires_grad = False + + @staticmethod + def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): + core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) + model = ZoeDepthNK(core, **kwargs) + if pretrained_resource: + assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" + model = load_state_from_resource(model, pretrained_resource) + return model + + @staticmethod + def build_from_config(config): + return ZoeDepthNK.build(**config) diff --git a/zoedepth/trainers/base_trainer.py b/zoedepth/trainers/base_trainer.py index 6009b9e5f64b32b019fcdae3798c59c6741c84ca..33fbbea3a7d49efe11b005adb5127f441eabfaf6 100644 --- a/zoedepth/trainers/base_trainer.py +++ b/zoedepth/trainers/base_trainer.py @@ -22,8 +22,6 @@ # File author: Shariq Farooq Bhat -# This file may include modifications from author Zhenyu Li - import os import uuid import warnings @@ -63,25 +61,6 @@ class BaseTrainer: self.optimizer = self.init_optimizer() self.scheduler = self.init_scheduler() - # import matplotlib.pyplot as plt - # lrs = [] - # momentums = [] - # for e in 
range(self.config.epochs): - # for s in range(len(self.train_loader)): - # self.scheduler.step() - # self.optimizer.step() - # lr = self.scheduler.get_last_lr()[2] - # lrs.append(lr) - # print(self.optimizer.param_groups[0]['betas']) - # momentum = self.optimizer.param_groups[0]['betas'][0] - # momentums.append(momentum) - - # step = [_ for _ in range(len(lrs))] - # plt.scatter(step, momentums) - # plt.savefig("debug.png") - # exit(100) - - def resize_to_target(self, prediction, target): if prediction.shape[2:] != target.shape[-2:]: prediction = nn.functional.interpolate( @@ -135,8 +114,7 @@ class BaseTrainer: lrs = [l['lr'] for l in self.optimizer.param_groups] return optim.lr_scheduler.OneCycleLR(self.optimizer, lrs, epochs=self.config.epochs, steps_per_epoch=len(self.train_loader), cycle_momentum=self.config.cycle_momentum, - base_momentum=self.config.get('base_momentum', 0.85), max_momentum=self.config.get('max_momentum', 0.95), div_factor=self.config.div_factor, - final_div_factor=self.config.final_div_factor, pct_start=self.config.pct_start, three_phase=self.config.three_phase) + base_momentum=0.85, max_momentum=0.95, div_factor=self.config.div_factor, final_div_factor=self.config.final_div_factor, pct_start=self.config.pct_start, three_phase=self.config.three_phase) def train_on_batch(self, batch, train_step): raise NotImplementedError @@ -184,6 +162,7 @@ class BaseTrainer: if self.config.prefetch: + for i, batch in tqdm(enumerate(self.train_loader), desc=f"Prefetching...", total=self.iters_per_epoch) if is_rank_zero(self.config) else enumerate(self.train_loader): pass @@ -196,10 +175,6 @@ class BaseTrainer: break self.epoch = epoch - # self.model.eval() - # metrics, test_losses = self.validate() - # print(metrics) - # exit(100) ################################# Train loop ########################################################## if self.should_log: wandb.log({"Epoch": epoch}, step=self.step) @@ -221,12 +196,7 @@ class BaseTrainer: if self.should_log and self.step % 50 == 0: wandb.log({f"Train/{name}": loss.item() - for name, loss in losses.items()}, step=self.step) - # current_lr = self.optimizer.param_groups[0]['lr'] - current_lr = self.scheduler.get_last_lr()[0] - wandb.log({f"Train/LR": current_lr}, step=self.step) - momentum = self.optimizer.param_groups[0]['betas'][0] - wandb.log({f"Train/momentum": momentum}, step=self.step) + for name, loss in losses.items()}, step=self.step) self.step += 1 @@ -298,10 +268,7 @@ class BaseTrainer: if metrics: metrics_avg.update(metrics) - r1, r2 = metrics_avg.get_value(), losses_avg.get_value() - if self.should_log and self.config.get("debug", False): - print(r1) - return r1, r2 + return metrics_avg.get_value(), losses_avg.get_value() def save_checkpoint(self, filename): if not self.should_write: @@ -319,31 +286,23 @@ class BaseTrainer: "epoch": self.epoch }, fpath) - def log_images(self, rgb: Dict[str, list] = {}, depth: Dict[str, list] = {}, scalar_field: Dict[str, list] = {}, prefix="", scalar_cmap="turbo_r", min_depth=None, max_depth=None): + def log_images(self, rgb: Dict[str, list] = {}, depth: Dict[str, list] = {}, scalar_field: Dict[str, list] = {}, prefix="", scalar_cmap="jet", min_depth=None, max_depth=None): if not self.should_log: return - # if min_depth is None: - # try: - # min_depth = self.config.min_depth - # max_depth = self.config.max_depth - # except AttributeError: - # min_depth = None - # max_depth = None - - depths_gt = depth['GT'] - invalid_mask = torch.logical_or(depths_gt<=min_depth, 
depths_gt>=max_depth).detach().cpu().squeeze().numpy() - min_depth = None - max_depth = None - - depth = {k: colorize(v, vmin=min_depth, vmax=max_depth, invalid_mask=invalid_mask) + if min_depth is None: + try: + min_depth = self.config.min_depth + max_depth = self.config.max_depth + except AttributeError: + min_depth = None + max_depth = None + + depth = {k: colorize(v, vmin=min_depth, vmax=max_depth) for k, v in depth.items()} - # depth = {k: colorize(v, vmin=0, vmax=80, invalid_mask=invalid_mask) - # for k, v in depth.items()} scalar_field = {k: colorize( v, vmin=None, vmax=None, cmap=scalar_cmap) for k, v in scalar_field.items()} images = {**rgb, **depth, **scalar_field} - wimages = { prefix+"Predictions": [wandb.Image(v, caption=k) for k, v in images.items()]} wandb.log(wimages, step=self.step) diff --git a/zoedepth/trainers/loss.py b/zoedepth/trainers/loss.py index 4381f5f956c546211f48c7c762f9e9a314ccab04..0c5a1c15cdf5628c1474c566fdc6e58159d7f5ab 100644 --- a/zoedepth/trainers/loss.py +++ b/zoedepth/trainers/loss.py @@ -22,21 +22,15 @@ # File author: Shariq Farooq Bhat -# This file may include modifications from author Zhenyu Li - import torch import torch.nn as nn import torch.nn.functional as F import torch.cuda.amp as amp import numpy as np -from torch.autograd import Variable -from math import exp -import matplotlib.pyplot as plt KEY_OUTPUT = 'metric_depth' -# import kornia -import copy + def extract_key(prediction, key): if isinstance(prediction, dict): @@ -53,8 +47,6 @@ class SILogLoss(nn.Module): self.beta = beta def forward(self, input, target, mask=None, interpolate=True, return_interpolated=False): - hack_input = input - input = extract_key(input, KEY_OUTPUT) if input.shape[-1] != target.shape[-1] and interpolate: input = nn.functional.interpolate( @@ -86,12 +78,6 @@ class SILogLoss(nn.Module): loss = 10 * torch.sqrt(Dg) if torch.isnan(loss): - if input.numel() == 0: - loss = torch.mean(hack_input) * 0 - if not return_interpolated: - return loss - return loss, intr_input - print("Nan SILog loss") print("input:", input.shape) print("target:", target.shape) @@ -121,33 +107,6 @@ def grad_mask(mask): return mask[..., 1:, 1:] & mask[..., 1:, :-1] & mask[..., :-1, 1:] -# class GradL1Loss(nn.Module): -# """Gradient loss""" -# def __init__(self): -# super(GradL1Loss, self).__init__() -# self.name = 'GradL1' - -# def forward(self, input, target, mask=None, interpolate=True, return_interpolated=False): -# input = extract_key(input, KEY_OUTPUT) -# if input.shape[-1] != target.shape[-1] and interpolate: -# input = nn.functional.interpolate( -# input, target.shape[-2:], mode='bilinear', align_corners=True) -# intr_input = input -# else: -# intr_input = input - -# grad_gt = grad(target) -# grad_pred = grad(input) -# mask_g = grad_mask(mask) - -# loss = nn.functional.l1_loss(grad_pred[0][mask_g], grad_gt[0][mask_g]) -# loss = loss + \ -# nn.functional.l1_loss(grad_pred[1][mask_g], grad_gt[1][mask_g]) -# if not return_interpolated: -# return loss -# return loss, intr_input - - class GradL1Loss(nn.Module): """Gradient loss""" def __init__(self): @@ -319,7 +278,6 @@ def compute_scale_and_shift(prediction, target, mask): x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / det[valid] return x_0, x_1 - class ScaleAndShiftInvariantLoss(nn.Module): def __init__(self): super().__init__() @@ -347,23 +305,6 @@ class ScaleAndShiftInvariantLoss(nn.Module): return loss, intr_input -class BudgetConstraint(nn.Module): - """ - Given budget constraint to reduce expected inference FLOPs in 
the Dynamic Network. - """ - def __init__(self, loss_mu, flops_all, warm_up=True): - super().__init__() - self.loss_mu = loss_mu - self.flops_all = flops_all - self.warm_up = warm_up - - def forward(self, flops_expt, warm_up_rate=1.0): - if self.warm_up: - warm_up_rate = min(1.0, warm_up_rate) - else: - warm_up_rate = 1.0 - losses = warm_up_rate * ((flops_expt / self.flops_all - self.loss_mu)**2) - return losses if __name__ == '__main__': @@ -373,483 +314,3 @@ if __name__ == '__main__': d = torch.Tensor([6.59, 3.8, 10.0]) print(celoss.dequantize_depth(celoss.quantize_depth(d))) - - - -class HistogramMatchingLoss(nn.Module): - def __init__(self, min_depth, max_depth, bins=512): - super(HistogramMatchingLoss, self).__init__() - self.name = 'HistogramMatchingLoss' - self.min_depth = min_depth - self.max_depth = max_depth - self.bins = bins - - def forward(self, input, target, mask, interpolate=True): - if input.shape[-1] != mask.shape[-1] and interpolate: - input = nn.functional.interpolate( - input, mask.shape[-2:], mode='bilinear', align_corners=True) - - if target.shape[-1] != mask.shape[-1] and interpolate: - target = nn.functional.interpolate( - target, mask.shape[-2:], mode='bilinear', align_corners=True) - - input[~mask] = 0 - target[~mask] = 0 - - - pred_hist = torch.histc(input, bins=self.bins, min=self.min_depth, max=self.max_depth) - gt_hist = torch.histc(target, bins=self.bins, min=self.min_depth, max=self.max_depth) - - pred_hist /= pred_hist.sum(dim=0, keepdim=True) - gt_hist /= gt_hist.sum(dim=0, keepdim=True) - - # print(pred_hist.shape) - # print(pred_hist) - # _pred_hist = pred_hist.detach().cpu().numpy() - # _gt_hist = gt_hist.detach().cpu().numpy() - # plt.subplot(2, 1, 1) - # plt.bar(range(len(_pred_hist)), _pred_hist) - # plt.subplot(2, 1, 2) - # plt.bar(range(len(_gt_hist)), _gt_hist) - # plt.savefig('./debug_scale.png') - - # Compute cumulative histograms (CDF) - cdf_pred = torch.cumsum(pred_hist, dim=0) - cdf_gt = torch.cumsum(gt_hist, dim=0) - - # Compute Earth Mover's Distance (EMD) between the CDFs - loss = torch.mean(torch.abs(cdf_pred - cdf_gt)) - # loss = torch.mean(torch.sqrt((pred_hist - gt_hist)**2)) - # loss = F.kl_div(torch.log(pred_hist + 1e-10), gt_hist, reduction='mean') - - return loss - - - - -def gaussian(window_size, sigma): - gauss = torch.Tensor([exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)]) - return gauss/gauss.sum() - -def create_window(window_size, channel): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) - window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) - return window - -def _ssim(img1, img2, window, window_size, channel, size_average = True): - mu1 = F.conv2d(img1, window, padding = window_size//2, groups = channel) - mu2 = F.conv2d(img2, window, padding = window_size//2, groups = channel) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1*mu2 - - sigma1_sq = F.conv2d(img1*img1, window, padding = window_size//2, groups = channel) - mu1_sq - sigma2_sq = F.conv2d(img2*img2, window, padding = window_size//2, groups = channel) - mu2_sq - sigma12 = F.conv2d(img1*img2, window, padding = window_size//2, groups = channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - - ssim_map = ((2*mu1_mu2 + C1)*(2*sigma12 + C2))/((mu1_sq + mu2_sq + C1)*(sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean(1).mean(1).mean(1) - -class 
SSIM(torch.nn.Module): - def __init__(self, window_size = 11, size_average = True): - super(SSIM, self).__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = create_window(window_size, self.channel) - - def forward(self, img1, img2, mask, interpolate=True): - if img1.shape[-1] != mask.shape[-1] and interpolate: - img1 = nn.functional.interpolate( - img1, mask.shape[-2:], mode='bilinear', align_corners=True) - - if img2.shape[-1] != mask.shape[-1] and interpolate: - img2 = nn.functional.interpolate( - img2, mask.shape[-2:], mode='bilinear', align_corners=True) - - img1[~mask] = 0 - img2[~mask] = 0 - - (_, channel, _, _) = img1.size() - - if channel == self.channel and self.window.data.type() == img1.data.type(): - window = self.window - else: - window = create_window(self.window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) - window = window.type_as(img1) - - self.window = window - self.channel = channel - - - loss = _ssim(img1, img2, window, self.window_size, channel, self.size_average) - return loss - -def ssim(img1, img2, window_size = 11, size_average = True): - (_, channel, _, _) = img1.size() - window = create_window(window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) - window = window.type_as(img1) - - return _ssim(img1, img2, window, window_size, channel, size_average) - -class ConsistencyLoss(nn.Module): - def __init__(self, target, focus_flatten=False, wp=1) -> None: - super().__init__() - self.name = 'Consistency' - self.target = target - self.mode = 'no-resize' - # self.mode = 'resize' - self.focus_flatten = focus_flatten - self.wp = wp - - def gradient_y(self, img): - # gy = torch.cat([F.conv2d(img[:, i, :, :].unsqueeze(0), torch.Tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) for i in range(img.shape[1])], 1) - gy = F.conv2d(img, torch.Tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) - return gy - - def gradient_x(self, img): - # gx = torch.cat([F.conv2d(img[:, i, :, :].unsqueeze(0), torch.Tensor([[1, 0, -1], [2, 0, -2], [1, 0, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) for i in range(img.shape[1])], 1) - gx = F.conv2d(img, torch.Tensor([[1, 0, -1], [2, 0, -2], [1, 0, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) - return gx - - def forward(self, depth_preds, shifts, mask, temp_features, pred_f=None): - - common_area_1_list = [] - common_area_2_list = [] - - if self.focus_flatten: - # only consider flatten place - grad = kornia.filters.spatial_gradient(pred_f.detach()) - grad_x, grad_y = grad[:, :, 0, :, :], grad[:, :, 1, :, :] - grad = torch.sqrt(grad_x ** 2 + grad_y ** 2) - grad_ext = grad > 0.05 # over 5cm - grad_ext = grad_ext.float() - grad_blur = kornia.filters.gaussian_blur2d(grad_ext, (11, 11), (3, 3)) - grad_ext = grad_blur > 0 # over 5cm - grad_ext = grad_blur == 0 - mask = torch.logical_and(mask, grad_ext) - - - if self.target == "mix": - ## for feature - bs, c, h, w = depth_preds.shape - split_depth = torch.split(depth_preds, bs//2, dim=0) - split_mask = torch.split(F.interpolate(mask.float(), (384, 512)).bool(), bs//2, dim=0) - - feat_ori_list = [] - feat_shift_list = [] - multi_level_mask = [] - - for idx, feature in enumerate(temp_features): # multi-level - split_feat = torch.split(feature, bs//2, dim=0) - - _, _, h, w = split_feat[0].shape - feat_ori_list.append(split_feat[0]) - feat_shift_list.append(split_feat[1]) - - mask_ori_cur_scale = 
F.interpolate(split_mask[0].float(), (h, w)).bool() - multi_level_mask.append(mask_ori_cur_scale) - - for idx_out, (feat_ori_cur_level, feat_shift_cur_level, mask_ori_cur_level) in enumerate(zip(feat_ori_list, feat_shift_list, multi_level_mask)): # iter multi-scale - scale_factor = 2 ** (5 - idx_out) - _, _, cur_scale_h, cur_scale_w = feat_ori_cur_level.shape - scale_factor = int(384 / cur_scale_h) - - for idx_in, (feat_ori, feat_shift, mask_ori, shift_bs) in enumerate(zip(feat_ori_cur_level, feat_shift_cur_level, mask_ori_cur_level, shifts)): # iter bs (paired feat) - c, _, _ = feat_ori.shape - mask_ori = mask_ori.repeat(c, 1, 1) - shift_h, shift_w = int(shift_bs[0] * (384/540) / scale_factor), int(shift_bs[1]* (512/960) / scale_factor) - - if shift_h >= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, shift_h:, shift_w:] - common_area_2 = feat_shift[:, :-shift_h, :-shift_w] - mask_common = mask_ori[:, shift_h:, shift_w:] - elif shift_h >= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, shift_h:, :-abs(shift_w)] - common_area_2 = feat_shift[:, :-shift_h, abs(shift_w):] - mask_common = mask_ori[:, shift_h:, :-abs(shift_w)] - elif shift_h <= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, :-abs(shift_h), :-abs(shift_w)] - common_area_2 = feat_shift[:, abs(shift_h):, abs(shift_w):] - mask_common = mask_ori[:, :-abs(shift_h), :-abs(shift_w)] - elif shift_h <= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, :-abs(shift_h):, shift_w:] - common_area_2 = feat_shift[:, abs(shift_h):, :-shift_w] - mask_common = mask_ori[:, :-abs(shift_h):, shift_w:] - else: - print("can you really reach here?") - - common_area_masked_1 = common_area_1[mask_common].flatten() - common_area_masked_2 = common_area_2[mask_common].flatten() - common_area_1_list.append(common_area_masked_1) - common_area_2_list.append(common_area_masked_2) - - common_area_1 = torch.cat(common_area_1_list) - common_area_2 = torch.cat(common_area_2_list) - if common_area_1.numel() == 0 or common_area_2.numel() == 0: - consistency_loss = torch.Tensor([0]).squeeze() - else: - consistency_loss = F.mse_loss(common_area_1, common_area_2) - consistency_loss_feat = consistency_loss - - - common_area_1_list = [] - common_area_2_list = [] - - ## for pred - bs, c, h, w = depth_preds.shape - split_depth = torch.split(depth_preds, bs//2, dim=0) - split_mask = torch.split(mask, bs//2, dim=0) - - for shift, depth_ori, depth_shift, mask_ori, mask_shift in zip(shifts, split_depth[0], split_depth[1], split_mask[0], split_mask[1]): - shift_h, shift_w = shift[0], shift[1] - if shift_h >= 0 and shift_w >= 0: - common_area_1 = depth_ori[:, shift_h:, shift_w:] - common_area_2 = depth_shift[:, :-shift_h, :-shift_w] - mask_common = mask_ori[:, shift_h:, shift_w:] - # mask_debug = mask_shift[:, :-shift_h, :-shift_w] - elif shift_h >= 0 and shift_w <= 0: - common_area_1 = depth_ori[:, shift_h:, :-abs(shift_w)] - common_area_2 = depth_shift[:, :-shift_h, abs(shift_w):] - mask_common = mask_ori[:, shift_h:, :-abs(shift_w)] - # mask_debug = mask_shift[:, :-shift_h, abs(shift_w):] - elif shift_h <= 0 and shift_w <= 0: - common_area_1 = depth_ori[:, :-abs(shift_h), :-abs(shift_w)] - common_area_2 = depth_shift[:, abs(shift_h):, abs(shift_w):] - mask_common = mask_ori[:, :-abs(shift_h), :-abs(shift_w)] - # mask_debug = mask_shift[:, abs(shift_h):, abs(shift_w):] - elif shift_h <= 0 and shift_w >= 0: - common_area_1 = depth_ori[:, :-abs(shift_h):, shift_w:] - common_area_2 = depth_shift[:, abs(shift_h):, :-shift_w] - mask_common = mask_ori[:, :-abs(shift_h):, 
shift_w:] - # mask_debug = mask_shift[:, abs(shift_h):, :-shift_w] - else: - print("can you really reach here?") - - common_area_1 = common_area_1[mask_common].flatten() - common_area_2 = common_area_2[mask_common].flatten() - common_area_1_list.append(common_area_1) - common_area_2_list.append(common_area_2) - - common_area_1 = torch.cat(common_area_1_list) - common_area_2 = torch.cat(common_area_2_list) - if common_area_1.numel() == 0 or common_area_2.numel() == 0: - consistency_loss = torch.Tensor([0]).squeeze() - else: - # pred_hist = torch.histc(common_area_1, bins=512, min=0, max=80) - # gt_hist = torch.histc(common_area_2, bins=512, min=0, max=80) - - # pred_hist /= pred_hist.sum(dim=0, keepdim=True) - # gt_hist /= gt_hist.sum(dim=0, keepdim=True) - - # # Compute cumulative histograms (CDF) - # cdf_pred = torch.cumsum(pred_hist, dim=0) - # cdf_gt = torch.cumsum(gt_hist, dim=0) - - # # Compute Earth Mover's Distance (EMD) between the CDFs - # consistency_loss = torch.mean(torch.abs(cdf_pred - cdf_gt)) - consistency_loss = F.mse_loss(common_area_1, common_area_2) - consistency_loss_pred = consistency_loss - - consistency_loss = consistency_loss_pred * self.wp + consistency_loss_feat - return consistency_loss - - elif 'feat' in self.target: - if self.mode == 'resize': - bs, c, h, w = depth_preds.shape - split_depth = torch.split(depth_preds, bs//2, dim=0) - split_mask = torch.split(mask, bs//2, dim=0) - - feat_ori_list = [] - feat_shift_list = [] - - for idx, feature in enumerate(temp_features): # multi-level - if idx < 4: - continue - - split_feat = torch.split(feature, bs//2, dim=0) - f = F.interpolate(split_feat[0], (h, w), mode='bilinear', align_corners=True) - feat_ori_list.append(f) - f = F.interpolate(split_feat[1], (h, w), mode='bilinear', align_corners=True) - feat_shift_list.append(f) - - - for idx_out, (feat_ori_cur_level, feat_shift_cur_level) in enumerate(zip(feat_ori_list, feat_shift_list)): # iter multi-scale - scale_factor = 2 ** (5 - idx_out) - - for idx_in, (feat_ori, feat_shift, mask_ori, shift_bs) in enumerate(zip(feat_ori_cur_level, feat_shift_cur_level, split_mask[0], shifts)): # iter bs (paired feat) - c, h, w = feat_ori.shape - mask_ori = mask_ori.repeat(c, 1, 1) - shift_h, shift_w = shift_bs[0], shift_bs[1] - - if shift_h >= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, shift_h:, shift_w:] - common_area_2 = feat_shift[:, :-shift_h, :-shift_w] - mask_common = mask_ori[:, shift_h:, shift_w:] - elif shift_h >= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, shift_h:, :-abs(shift_w)] - common_area_2 = feat_shift[:, :-shift_h, abs(shift_w):] - mask_common = mask_ori[:, shift_h:, :-abs(shift_w)] - elif shift_h <= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, :-abs(shift_h), :-abs(shift_w)] - common_area_2 = feat_shift[:, abs(shift_h):, abs(shift_w):] - mask_common = mask_ori[:, :-abs(shift_h), :-abs(shift_w)] - elif shift_h <= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, :-abs(shift_h):, shift_w:] - common_area_2 = feat_shift[:, abs(shift_h):, :-shift_w] - mask_common = mask_ori[:, :-abs(shift_h):, shift_w:] - else: - print("can you really reach here?") - - common_area_masked_1 = common_area_1[mask_common].flatten() - common_area_masked_2 = common_area_2[mask_common].flatten() - # common_area_masked_1 = common_area_1.flatten() - # common_area_masked_2 = common_area_2.flatten() - common_area_1_list.append(common_area_masked_1) - common_area_2_list.append(common_area_masked_2) - - common_area_1 = torch.cat(common_area_1_list) - common_area_2 = 
torch.cat(common_area_2_list) - if common_area_1.numel() == 0 or common_area_2.numel() == 0: - consistency_loss = torch.Tensor([0]).squeeze() - else: - consistency_loss = F.mse_loss(common_area_1, common_area_2) - - return consistency_loss - - - else: - bs, c, h, w = depth_preds.shape - split_depth = torch.split(depth_preds, bs//2, dim=0) - mask = F.interpolate(mask.float(), (384, 512)).bool() # back to 384, 512 - split_mask = torch.split(mask, bs//2, dim=0) - - feat_ori_list = [] - feat_shift_list = [] - multi_level_mask = [] - - for idx, feature in enumerate(temp_features): # multi-level - split_feat = torch.split(feature, bs//2, dim=0) - - _, _, h, w = split_feat[0].shape - feat_ori_list.append(split_feat[0]) - feat_shift_list.append(split_feat[1]) - - mask_ori_cur_scale = F.interpolate(split_mask[0].float(), (h, w)).bool() - multi_level_mask.append(mask_ori_cur_scale) - - for idx_out, (feat_ori_cur_level, feat_shift_cur_level, mask_ori_cur_level) in enumerate(zip(feat_ori_list, feat_shift_list, multi_level_mask)): # iter multi-scale - scale_factor = 2 ** (5 - idx_out) - _, _, cur_scale_h, cur_scale_w = feat_ori_cur_level.shape - scale_factor = int(384 / cur_scale_h) - - for idx_in, (feat_ori, feat_shift, mask_ori, shift_bs) in enumerate(zip(feat_ori_cur_level, feat_shift_cur_level, mask_ori_cur_level, shifts)): # iter bs (paired feat) - c, _, _ = feat_ori.shape - mask_ori = mask_ori.repeat(c, 1, 1) - shift_h, shift_w = int(shift_bs[0] * (384/540) / scale_factor), int(shift_bs[1]* (512/960) / scale_factor) - - if shift_h >= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, shift_h:, shift_w:] - common_area_2 = feat_shift[:, :-shift_h, :-shift_w] - mask_common = mask_ori[:, shift_h:, shift_w:] - elif shift_h >= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, shift_h:, :-abs(shift_w)] - common_area_2 = feat_shift[:, :-shift_h, abs(shift_w):] - mask_common = mask_ori[:, shift_h:, :-abs(shift_w)] - elif shift_h <= 0 and shift_w <= 0: - common_area_1 = feat_ori[:, :-abs(shift_h), :-abs(shift_w)] - common_area_2 = feat_shift[:, abs(shift_h):, abs(shift_w):] - mask_common = mask_ori[:, :-abs(shift_h), :-abs(shift_w)] - elif shift_h <= 0 and shift_w >= 0: - common_area_1 = feat_ori[:, :-abs(shift_h):, shift_w:] - common_area_2 = feat_shift[:, abs(shift_h):, :-shift_w] - mask_common = mask_ori[:, :-abs(shift_h):, shift_w:] - else: - print("can you really reach here?") - - common_area_masked_1 = common_area_1[mask_common].flatten() - common_area_masked_2 = common_area_2[mask_common].flatten() - common_area_1_list.append(common_area_masked_1) - common_area_2_list.append(common_area_masked_2) - - common_area_1 = torch.cat(common_area_1_list) - common_area_2 = torch.cat(common_area_2_list) - if common_area_1.numel() == 0 or common_area_2.numel() == 0: - consistency_loss = torch.Tensor([0]).squeeze() - else: - consistency_loss = F.mse_loss(common_area_1, common_area_2) - return consistency_loss - - elif self.target == 'pred': - bs, c, h, w = depth_preds.shape - split_depth = torch.split(depth_preds, bs//2, dim=0) - split_mask = torch.split(mask, bs//2, dim=0) - - for shift, depth_ori, depth_shift, mask_ori, mask_shift in zip(shifts, split_depth[0], split_depth[1], split_mask[0], split_mask[1]): - shift_h, shift_w = shift[0], shift[1] - if shift_h >= 0 and shift_w >= 0: - common_area_1 = depth_ori[:, shift_h:, shift_w:] - common_area_2 = depth_shift[:, :-shift_h, :-shift_w] - mask_common = mask_ori[:, shift_h:, shift_w:] - # mask_debug = mask_shift[:, :-shift_h, :-shift_w] - elif shift_h >= 0 and 
shift_w <= 0: - common_area_1 = depth_ori[:, shift_h:, :-abs(shift_w)] - common_area_2 = depth_shift[:, :-shift_h, abs(shift_w):] - mask_common = mask_ori[:, shift_h:, :-abs(shift_w)] - # mask_debug = mask_shift[:, :-shift_h, abs(shift_w):] - elif shift_h <= 0 and shift_w <= 0: - common_area_1 = depth_ori[:, :-abs(shift_h), :-abs(shift_w)] - common_area_2 = depth_shift[:, abs(shift_h):, abs(shift_w):] - mask_common = mask_ori[:, :-abs(shift_h), :-abs(shift_w)] - # mask_debug = mask_shift[:, abs(shift_h):, abs(shift_w):] - elif shift_h <= 0 and shift_w >= 0: - common_area_1 = depth_ori[:, :-abs(shift_h):, shift_w:] - common_area_2 = depth_shift[:, abs(shift_h):, :-shift_w] - mask_common = mask_ori[:, :-abs(shift_h):, shift_w:] - # mask_debug = mask_shift[:, abs(shift_h):, :-shift_w] - else: - print("can you really reach here?") - - common_area_1 = common_area_1[mask_common].flatten() - common_area_2 = common_area_2[mask_common].flatten() - common_area_1_list.append(common_area_1) - common_area_2_list.append(common_area_2) - - common_area_1 = torch.cat(common_area_1_list) - common_area_2 = torch.cat(common_area_2_list) - if common_area_1.numel() == 0 or common_area_2.numel() == 0: - consistency_loss = torch.Tensor([0]).squeeze() - else: - # pred_hist = torch.histc(common_area_1, bins=512, min=0, max=80) - # gt_hist = torch.histc(common_area_2, bins=512, min=0, max=80) - - # pred_hist /= pred_hist.sum(dim=0, keepdim=True) - # gt_hist /= gt_hist.sum(dim=0, keepdim=True) - - # # Compute cumulative histograms (CDF) - # cdf_pred = torch.cumsum(pred_hist, dim=0) - # cdf_gt = torch.cumsum(gt_hist, dim=0) - - # # Compute Earth Mover's Distance (EMD) between the CDFs - # consistency_loss = torch.mean(torch.abs(cdf_pred - cdf_gt)) - consistency_loss = F.mse_loss(common_area_1, common_area_2) - - return consistency_loss - - else: - raise NotImplementedError \ No newline at end of file diff --git a/zoedepth/trainers/loss_sample.py b/zoedepth/trainers/loss_sample.py deleted file mode 100644 index b2cf3c146654b491300eaa3650b73387cb8632c9..0000000000000000000000000000000000000000 --- a/zoedepth/trainers/loss_sample.py +++ /dev/null @@ -1,173 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
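The ConsistencyLoss removed above repeats the same overlap bookkeeping for every sign combination of the crop shift: slice the original tensor forward by the shift, slice the shifted tensor backward by it, and compare only the shared region under the mask. A compact sketch of that idea for reference, using a hypothetical helper on (C, H, W) tensors that is not part of this diff and only mirrors the four sign cases for nonzero shifts:

import torch

def common_regions(a, b, shift_h, shift_w):
    # Crop two (C, H, W) tensors to the area they share after b was captured
    # with a (shift_h, shift_w) offset relative to a; hypothetical helper that
    # mirrors the four sign cases written out in the deleted ConsistencyLoss.
    def crop(t, dh, dw):
        t = t[:, dh:, :] if dh > 0 else (t[:, :dh, :] if dh < 0 else t)
        t = t[:, :, dw:] if dw > 0 else (t[:, :, :dw] if dw < 0 else t)
        return t
    return crop(a, shift_h, shift_w), crop(b, -shift_h, -shift_w)

# Example: a 5-pixel vertical, 3-pixel horizontal shift leaves a 59x61 overlap.
a, b = torch.rand(8, 64, 64), torch.rand(8, 64, 64)
ca, cb = common_regions(a, b, 5, 3)
assert ca.shape == cb.shape == (8, 59, 61)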
- -# File author: Zhenyu Li - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.cuda.amp as amp -import numpy as np - - -KEY_OUTPUT = 'metric_depth' - - -def extract_key(prediction, key): - if isinstance(prediction, dict): - return prediction[key] - return prediction - - -# Main loss function used for ZoeDepth. Copy/paste from AdaBins repo (https://github.com/shariqfarooq123/AdaBins/blob/0952d91e9e762be310bb4cd055cbfe2448c0ce20/loss.py#L7) -class SILogLoss(nn.Module): - """SILog loss (pixel-wise)""" - def __init__(self, beta=0.15): - super(SILogLoss, self).__init__() - self.name = 'SILog' - self.beta = beta - - def forward(self, input, target, mask=None): - input = extract_key(input, KEY_OUTPUT) - - if mask is not None: - input_filtered = input[mask] - target_filtered = target[mask] - - with amp.autocast(enabled=False): # amp causes NaNs in this loss function - alpha = 1e-7 - g = torch.log(input_filtered + alpha) - torch.log(target_filtered + alpha) - Dg = torch.var(g) + self.beta * torch.pow(torch.mean(g), 2) - loss = 10 * torch.sqrt(Dg) - - if torch.isnan(loss): - print("Nan SILog loss") - print("input:", input.shape) - print("target:", target.shape) - print("G", torch.sum(torch.isnan(g))) - print("Input min max", torch.min(input), torch.max(input)) - print("Target min max", torch.min(target), torch.max(target)) - print("Dg", torch.isnan(Dg)) - print("loss", torch.isnan(loss)) - - return loss - - - -import torch -import torch.nn.functional as F - -def gaussian(mu, sigma, labels): - return torch.exp(-0.5*(mu-labels)** 2/ sigma** 2)/sigma - -def laplacian(mu, b, labels): - # a = torch.abs(mu-labels)/b - # print("1 isnan: {}".format(torch.isnan(a).any())) - # print("1 isinf: {}".format(torch.isinf(a).any())) - # a = torch.exp(-(torch.abs(mu-labels)/b)) - # print(a) - - # print("1 isnan: {}".format(torch.isnan(a).any())) - # print("1 isinf: {}".format(torch.isinf(a).any())) - return 0.5 * torch.exp(-(torch.abs(mu-labels)/b))/b - -def distribution(mu, sigma, labels, dist="gaussian"): - return gaussian(mu, sigma, labels) if dist=="gaussian" else \ - laplacian(mu, sigma, labels) - -def bimodal_loss(mu0, mu1, sigma0, sigma1, w0, w1, labels, dist="gaussian"): - # first_term = w0 * distribution(mu0, sigma0, labels, dist) - # print(first_term) - # print("f isnan: {}".format(torch.isnan(first_term).any())) - # print("f isinf: {}".format(torch.isinf(first_term).any())) - # second_term = w1 * distribution(mu1, sigma1, labels, dist) - # print(second_term) - # print("s isnan: {}".format(torch.isnan(second_term).any())) - # print("s isinf: {}".format(torch.isinf(second_term).any())) - - loss = w0 * distribution(mu0, sigma0, labels, dist) + w1 * distribution(mu1, sigma1, labels, dist) - # loss = torch.clamp(loss, min=1e-12) - # print(loss) - return - torch.log(loss) - -def unimodal_loss(mu, sigma, labels): - return torch.abs(mu - labels)/sigma + torch.log(sigma) - -def smooth_l1_loss(preds, labels, reduce=None): - return F.smooth_l1_loss(preds, labels, reduce=reduce) - -def l1_loss(preds, labels, reduce=None): - return F.l1_loss(preds, labels, reduce=reduce) - - -class DistributionLoss(nn.Module): - def __init__(self, max_depth): - super(DistributionLoss, self).__init__() - self.name = 'DistributionLoss' - self.max_depth = max_depth - - def forward(self, input, target, mask=None, dist='biLaplacian'): - - - mu0 = input['mu0'] - mu1 = input['mu1'] - sigma0 = input['sigma0'] - sigma1 = input['sigma1'] - pi0 = input['pi0'] - pi1 = input['pi1'] - - pred_mask = (pi0 / sigma0 > 
pi1 / sigma1).float() - pred_depth = (mu0 * pred_mask + mu1 * (1. - pred_mask)) - pred_metric_depth = (1 - pred_depth) * self.max_depth - - - if mask is not None: - mu0 = mu0[mask] - mu1 = mu1[mask] - sigma0 = sigma0[mask] - sigma1 = sigma1[mask] - pi0 = pi0[mask] - pi1 = pi1[mask] - - # real_input = real_depth[mask] - - real_input = mu0 - pred_metric_depth = pred_metric_depth[mask] - record_target = target[mask] - - - target_filtered = 1 - target[mask] / self.max_depth - bi_loss = bimodal_loss(mu0, mu1, sigma0, sigma1, pi0, pi1, target_filtered, dist=dist).mean() - # print(bi_loss) - - alpha = 1e-7 - beta = 0.15 - g = torch.log(real_input + alpha) - torch.log(record_target + alpha) - Dg = torch.var(g) + beta * torch.pow(torch.mean(g), 2) - sig_loss = 10 * torch.sqrt(Dg) - # print(sig_loss) - - return bi_loss, sig_loss - - - - \ No newline at end of file diff --git a/zoedepth/trainers/zoedepth_custom_trainer.py b/zoedepth/trainers/zoedepth_custom_trainer.py deleted file mode 100644 index e4cd353a1a304e4cd13b82c0fe539862c9e2d85d..0000000000000000000000000000000000000000 --- a/zoedepth/trainers/zoedepth_custom_trainer.py +++ /dev/null @@ -1,722 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
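Every SILog variant deleted in these files, sampled or dense, computes the same scale-invariant statistic: with g = log(pred) - log(gt) over valid pixels, the loss is 10 * sqrt(var(g) + beta * mean(g)^2). A minimal sketch of that core computation, assuming the inputs are already masked, flattened, positive depth tensors:

import torch

def silog(pred, target, beta=0.15, eps=1e-7):
    # Scale-invariant log loss: variance of the log ratio plus a weighted
    # squared mean term, matching the formula in the removed loss classes.
    g = torch.log(pred + eps) - torch.log(target + eps)
    Dg = torch.var(g) + beta * torch.mean(g) ** 2
    return 10 * torch.sqrt(Dg)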
- -# File author: Zhenyu Li - -# This file is partly inspired from ZoeDepth (https://github.com/isl-org/ZoeDepth/blob/main/zoedepth/trainers/zoedepth_trainer.py); author: Shariq Farooq Bhat - -import os - -import torch -import torch.cuda.amp as amp -import torch.nn as nn - -from zoedepth.trainers.loss_sample import SILogLoss, DistributionLoss -from zoedepth.trainers.loss import SILogLoss as DenseSILogLoss -from zoedepth.trainers.loss import BudgetConstraint, HistogramMatchingLoss, SSIM, ConsistencyLoss -from zoedepth.utils.config import DATASETS_CONFIG -from zoedepth.utils.misc import compute_metrics -from zoedepth.data.preprocess import get_black_border - -from .base_trainer import BaseTrainer, is_rank_zero, colors, flatten -from torchvision import transforms -from PIL import Image -import numpy as np - -import wandb -import uuid -from tqdm import tqdm -from datetime import datetime as dt -import torch.distributed as dist - -import copy -from zoedepth.utils.misc import generatemask -import torch.optim as optim - -class Trainer(BaseTrainer): - def __init__(self, config, model, train_loader, test_loader=None, device=None): - self.addf = config.get("addf", False) - self.lazy_epoch = -1 - self.boostingdepth = config.get("boostingdepth", False) - super().__init__(config, model, train_loader, - test_loader=test_loader, device=device) - self.device = device - self.silog_loss = SILogLoss(beta=config.get("beta", 0.15)) - self.dense_silog_loss = DenseSILogLoss(beta=config.get("beta", 0.15)) - print("sigloss's beta is set to {}".format(config.get("beta", 0.15))) - self.scaler = amp.GradScaler(enabled=self.config.use_amp) - self.distribution_loss = DistributionLoss(max_depth=self.config.max_depth) - self.sampled_training = config.get("sampled_training", False) - self.sec_stage = config.get("sec_stage", False) - self.multi_consistency = config.get("multi_consistency", False) - self.use_blur = config.get("use_blur", False) - self.dynamic = config.get("dynamic", False) - if self.dynamic: - self.dynamic_unupdate_rate = config.get("dynamic_unupdate_rate", 0.0) - self.budget_loss = BudgetConstraint(loss_mu=0.0, flops_all=21552.5684, warm_up=True) - self.use_scale_loss = config.get("use_scale_loss", False) - if self.use_scale_loss: - if config.get("scale_type", "ssim"): - self.scale_loss = SSIM(window_size=config.get("window_size", int(11))) - else: - self.scale_loss = HistogramMatchingLoss(min_depth=self.config.min_depth, max_depth=self.config.max_depth) - self.scale_target = config.get("scale_target", None) - self.consistency_training = config.get("consistency_training", False) - if self.consistency_training: - self.consistency_target = config.get("consistency_target", None) - self.consistency_loss = ConsistencyLoss(self.consistency_target, config.get("focus_flatten", False), config.get("w_p", 1.0)) - print("current weight for consistency loss is {}. focus_flatten is {}. 
w_p is {}".format(self.config.w_consistency, config.get("focus_flatten", False), config.get("w_p", 1.0))) - - - - def train_on_batch(self, batch, train_step, step_rate): - """ - Expects a batch of images and depth as input - batch["image"].shape : batch_size, c, h, w - batch["depth"].shape : batch_size, 1, h, w - """ - - images, depths_gt = batch['image'].to(self.device), batch['depth'].to(self.device) - image_raw = batch.get("image_raw", None) - if image_raw is not None: - image_raw = image_raw.to(self.device) - sample_points = None - if self.sampled_training: - sample_points = batch['sample_points'].to(self.device) - bbox = batch.get("bbox", None) - if bbox is not None: - bbox = bbox.to(self.device) - bbox_raw = batch.get("bbox_raw", None) - if bbox_raw is not None: - bbox_raw = bbox_raw.to(self.device) - depth_raw = batch.get("depth_raw", None) - if depth_raw is not None: - depth_raw = depth_raw.to(self.device) - crop_area = batch.get("crop_area", None) - if crop_area is not None: - crop_area = crop_area.to(self.device) - shift = batch.get("shift", None) - if shift is not None: - shift = shift.to(self.device) - - - dataset = batch['dataset'][0] - - b, c, h, w = images.size() - mask = batch["mask"].to(self.device).to(torch.bool) - sample_mask = batch.get("sample_mask", None) - if sample_mask is not None: - sample_mask = sample_mask.to(self.device).to(torch.bool) - mask_raw = batch.get("mask_raw", None) - if mask_raw is not None: - mask_raw = mask_raw.to(self.device).to(torch.bool) - - losses = {} - - with amp.autocast(enabled=self.config.use_amp): - if self.sampled_training: - output = self.model(images, sample_points, mode='train', image_raw=image_raw, bbox=bbox, depth_raw=depth_raw, crop_area=crop_area, shift=shift, bbox_raw=bbox_raw) - else: - output = self.model(images, None, mode='train', image_raw=image_raw, bbox=bbox, depth_raw=depth_raw, crop_area=crop_area, shift=shift, bbox_raw=bbox_raw) - - - if self.boostingdepth: - if self.lazy_epoch < self.epoch: - output.update_learning_rate() - self.lazy_epoch = self.epoch - input_dict = dict() - input_dict['data_gtfake'] = depths_gt - output.set_input_train_gt(input_dict) - output.optimize_parameters() - pred_depths = output.fake_B - pred = output.fake_B - # print(torch.min(pred), torch.max(pred)) - losses = output.get_current_losses() - - else: - pred_depths = output['metric_depth'] - - if self.sampled_training: - sampled_depth_gt = sample_points[:, :, -1].float().unsqueeze(dim=-1) - sampled_depth_gt = sampled_depth_gt.permute(0, 2, 1) - - if self.config.get("representation", "") == 'biLaplacian': - # only for sampled training for now - l_dist, l_si = self.distribution_loss(output, sampled_depth_gt, mask=sample_mask) - loss = self.config.w_dist * l_dist + self.config.w_si * l_si - losses['distribution_loss'] = l_dist - losses['sigloss'] = l_si - - if self.multi_consistency: - coarse, fine = output['coarse_depth_pred'], output['fine_depth_pred'] - l_si_f = self.dense_silog_loss( - fine, depths_gt, mask=mask, interpolate=True, return_interpolated=False) - l_si_c = self.dense_silog_loss( - coarse, depth_raw, mask=mask_raw, interpolate=True, return_interpolated=False) - - losses['sigloss_f'] = l_si_f - losses['l_si_c'] = l_si_c - loss += self.config.w_si * (l_si_f + l_si_c) - - else: - if self.sampled_training: - l_si = self.silog_loss( - pred_depths, sampled_depth_gt, mask=sample_mask) - loss = self.config.w_si * l_si - losses[self.silog_loss.name] = l_si - - if self.multi_consistency: - coarse, fine = output['coarse_depth_pred'], 
output['fine_depth_pred'] - l_si_f = self.dense_silog_loss( - fine, depths_gt, mask=mask, interpolate=True, return_interpolated=False) - l_si_c = self.dense_silog_loss( - coarse, depth_raw, mask=mask_raw, interpolate=True, return_interpolated=False) - - losses['sigloss_f'] = l_si_f - losses['l_si_c'] = l_si_c - loss += self.config.w_si * (l_si_f + l_si_c) - - else: - if self.multi_consistency: - #### here here here - pred_depths, coarse, fine = output['metric_depth'], output['coarse_depth_pred'], output['fine_depth_pred'] - - if self.consistency_training: - depths_gt = torch.split(depths_gt, 1, dim=1) - depths_gt = torch.cat(depths_gt, dim=0) - mask = torch.split(mask, 1, dim=-1) - mask = torch.cat(mask, dim=0).permute(0, 3, 1, 2) - mask_raw = torch.cat([mask_raw, mask_raw], dim=0) - depth_raw = torch.cat([depth_raw, depth_raw], dim=0) - temp_features = output.get('temp_features', None) - - - l_si_1, pred = self.dense_silog_loss( - pred_depths, depths_gt, mask=mask, interpolate=True, return_interpolated=True) - l_si_f, pred_f = self.dense_silog_loss( - fine, depths_gt, mask=mask, interpolate=True, return_interpolated=True) - l_si_c = self.dense_silog_loss( - coarse, depth_raw, mask=mask_raw, interpolate=True, return_interpolated=False) - - losses[self.silog_loss.name] = l_si_1 - losses['sigloss_f'] = l_si_f - losses['l_si_c'] = l_si_c - # loss = l_si_1 + l_si_f + l_si_c - loss = l_si_1 - - if self.consistency_training: - try: - # depths_gt? pred_f? - l_consistency = self.consistency_loss(pred, shift, mask, temp_features, pred_f=depths_gt) # use the resized pred - except RuntimeError as e: - print(e) - print("some runtime error here! Hack with 0") - l_consistency = torch.Tensor([0]).squeeze() - - losses[self.consistency_loss.name] = l_consistency - loss += l_consistency * self.config.w_consistency - - else: - - l_si, pred = self.dense_silog_loss( - pred_depths, depths_gt, mask=mask, interpolate=True, return_interpolated=True) - - loss = self.config.w_si * l_si - losses[self.silog_loss.name] = l_si - - if self.dynamic: - if step_rate > self.dynamic_unupdate_rate: - warm_up_rate = min(1.0, (step_rate - self.dynamic_unupdate_rate) / 0.02) - flop_cost = self.budget_loss(output['all_cell_flops'], warm_up_rate=warm_up_rate) - loss += self.config.w_flop * flop_cost - losses['flop_loss'] = flop_cost - else: - flop_cost = self.budget_loss(output['all_cell_flops'], warm_up_rate=1) - loss += 0 * flop_cost - losses['flop_loss'] = flop_cost - - if self.use_scale_loss: - if self.scale_target == 'coarse': - h_loss = self.scale_loss(pred_depths, output['coarse_depth_pred_roi'], mask, interpolate=True) - else: - h_loss = self.scale_loss(pred_depths, depths_gt, mask, interpolate=True) - loss += self.config.w_scale * h_loss - losses['scale_loss'] = h_loss - - - # self.scaler.scale(loss).backward() - - # if self.config.clip_grad > 0: - # self.scaler.unscale_(self.optimizer) - # nn.utils.clip_grad_norm_( - # self.model.parameters(), self.config.clip_grad) - - # self.scaler.step(self.optimizer) - - # self.scaler.update() - # self.optimizer.zero_grad() - - - self.scaler.scale(loss).backward() - - if self.config.clip_grad > 0: - self.scaler.unscale_(self.optimizer) - nn.utils.clip_grad_norm_( - self.model.parameters(), self.config.clip_grad) - - self.scaler.step(self.optimizer) - - self.scaler.update() - self.optimizer.zero_grad() - - - if self.should_log and (self.step % int(self.config.log_images_every * self.iters_per_epoch)) == 0: - if self.config.get("debug", False): - pred = nn.functional.interpolate( - 
pred[0:1], depths_gt.shape[-2:], mode='bilinear', align_corners=True)[0] - import matplotlib.pyplot as plt - plt.imshow(pred.squeeze().detach().cpu().numpy()) - plt.savefig('debug.png') - pass - else: - pred = nn.functional.interpolate( - pred[0:1], depths_gt.shape[-2:], mode='bilinear', align_corners=True)[0] - depths_gt[torch.logical_not(mask)] = DATASETS_CONFIG[dataset]['max_depth'] - if self.consistency_training: - split_images = torch.split(images, 3, dim=1) - images = torch.cat(split_images, dim=0) - self.log_images(rgb={"Input": images[0, ...]}, depth={"GT": depths_gt[0], "PredictedMono": pred}, prefix="Train", - min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) - - return losses - - @torch.no_grad() - def eval_infer(self, x, image_raw, bboxs=None, crop_area=None, dataset='u4k', bbox_raw=None): - m = self.model.module if self.config.multigpu else self.model - - if dataset == 'u4k': - base_h = 540 - base_w = 960 - elif dataset == 'gta': - base_h = 270 - base_w = 480 - elif dataset == 'nyu': - base_h = 120 * 2 - base_w = 160 * 2 - else: - raise NotImplementedError - - if dataset == 'nyu': - if self.sec_stage: - images_crops = torch.split(x, 3, dim=1) - bboxs_list = torch.split(bboxs, 1, dim=1) - crop_areas = torch.split(crop_area, 1, dim=1) - - pred_depth_crops = [] - for i, (img, bbox, crop_area) in enumerate(zip(images_crops, bboxs_list, crop_areas)): - with amp.autocast(enabled=self.config.use_amp): - if i == 0: - out_dict = m(img, mode='eval', image_raw=image_raw, bbox=bbox[0], crop_area=crop_area, bbox_raw=bbox_raw[:, i, :] if bbox_raw is not None else None) - # whole_depth_pred = out_dict['coarse_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - else: - pred_depth_crop = m(img, mode='eval', image_raw=image_raw, bbox=bbox[0], crop_area=crop_area, bbox_raw=bbox_raw[:, i, :] if bbox_raw is not None else None)['metric_depth'] - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (base_h, base_w), mode='bilinear', align_corners=True) - pred_depth_crops.append(pred_depth_crop) - - x_start, y_start = [0, base_h], [0, base_w] - pred_depth = torch.zeros((base_h*2, base_w*2)).cuda() - inner_idx = 0 - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if self.use_blur: - pred_depth[x: x+base_h, y: y+base_w] = pred_depth_crops[inner_idx].squeeze() # do not care about boundry during validation - else: - pred_depth[x: x+base_h, y: y+base_w] = pred_depth_crops[inner_idx].squeeze() - inner_idx += 1 - pred_depth = pred_depth.squeeze(dim=0) - - else: - - with amp.autocast(enabled=self.config.use_amp): - pred_depth = m(x, mode='eval', image_raw=image_raw)['metric_depth'] - - else: - if self.sec_stage: - images_crops = torch.split(x, 3, dim=1) - bboxs_list = torch.split(bboxs, 1, dim=1) - crop_areas = torch.split(crop_area, 1, dim=1) - - pred_depth_crops = [] - for i, (img, bbox, crop_area) in enumerate(zip(images_crops, bboxs_list, crop_areas)): - with amp.autocast(enabled=self.config.use_amp): - if i == 0: - out_dict = m(img, mode='eval', image_raw=image_raw, bbox=bbox[0], crop_area=crop_area, bbox_raw=bbox_raw[:, i, :] if bbox_raw is not None else None) - # whole_depth_pred = out_dict['coarse_depth_pred'] - pred_depth_crop = out_dict['metric_depth'] - else: - pred_depth_crop = m(img, mode='eval', image_raw=image_raw, bbox=bbox[0], crop_area=crop_area, bbox_raw=bbox_raw[:, i, :] if bbox_raw is not None else None)['metric_depth'] - - pred_depth_crop = nn.functional.interpolate( - pred_depth_crop, (base_h, 
base_w), mode='bilinear', align_corners=True) - pred_depth_crops.append(pred_depth_crop) - - x_start, y_start = [0, base_h], [0, base_w] - pred_depth = torch.zeros((base_h*2, base_w*2)).cuda() - inner_idx = 0 - for ii, x in enumerate(x_start): - for jj, y in enumerate(y_start): - if self.use_blur: - pred_depth[x: x+base_h, y: y+base_w] = pred_depth_crops[inner_idx].squeeze() # do not care about boundry during validation - else: - pred_depth[x: x+base_h, y: y+base_w] = pred_depth_crops[inner_idx].squeeze() - inner_idx += 1 - pred_depth = pred_depth.squeeze(dim=0) - - else: - - with amp.autocast(enabled=self.config.use_amp): - pred_depth = m(x, mode='eval', image_raw=image_raw)['metric_depth'] - - return pred_depth - - @torch.no_grad() - def crop_aware_infer(self, x, image_raw): - # if we are not avoiding the black border, we can just use the normal inference - if not self.config.get("avoid_boundary", False): - return self.eval_infer(x) - - # otherwise, we need to crop the image to avoid the black border - # For now, this may be a bit slow due to converting to numpy and back - # We assume no normalization is done on the input image - - # get the black border - assert x.shape[0] == 1, "Only batch size 1 is supported for now" - x_pil = transforms.ToPILImage()(x[0].cpu()) - x_np = np.array(x_pil, dtype=np.uint8) - black_border_params = get_black_border(x_np) - top, bottom, left, right = black_border_params.top, black_border_params.bottom, black_border_params.left, black_border_params.right - x_np_cropped = x_np[top:bottom, left:right, :] - x_cropped = transforms.ToTensor()(Image.fromarray(x_np_cropped)) - - # run inference on the cropped image - pred_depths_cropped = self.eval_infer(x_cropped.unsqueeze(0).to(self.device)) - - # resize the prediction to x_np_cropped's size - pred_depths_cropped = nn.functional.interpolate( - pred_depths_cropped, size=(x_np_cropped.shape[0], x_np_cropped.shape[1]), mode="bilinear", align_corners=False) - - - # pad the prediction back to the original size - pred_depths = torch.zeros((1, 1, x_np.shape[0], x_np.shape[1]), device=pred_depths_cropped.device, dtype=pred_depths_cropped.dtype) - pred_depths[:, :, top:bottom, left:right] = pred_depths_cropped - - return pred_depths - - def validate_on_batch(self, batch, val_step): - images = batch['image'].to(self.device) - depths_gt = batch['depth'].to(self.device) - dataset = batch['dataset'][0] - image_raw = batch['image_raw'].to(self.device) - mask = batch["mask"].to(self.device) - disp_gt_edges = batch['disp_gt_edges'].squeeze().numpy() - bboxs = batch.get("bbox", None) - if bboxs is not None: - bboxs = bboxs.to(self.device) - bbox_raw = batch.get("bbox_raw", None) - if bbox_raw is not None: - bbox_raw = bbox_raw.to(self.device) - crop_area = batch.get("crop_area", None) - if crop_area is not None: - crop_area = crop_area.to(self.device) - - if 'has_valid_depth' in batch: - if not batch['has_valid_depth']: - return None, None - - depths_gt = depths_gt.squeeze().unsqueeze(0).unsqueeze(0) - mask = mask.squeeze().unsqueeze(0).unsqueeze(0) - # if dataset == 'nyu': - # pred_depths = self.crop_aware_infer(images, image_raw) - # else: - # pred_depths = self.eval_infer(images, image_raw, bboxs, crop_area, dataset, bbox_raw) - pred_depths = self.eval_infer(images, image_raw, bboxs, crop_area, dataset, bbox_raw) - pred_depths = pred_depths.squeeze().unsqueeze(0).unsqueeze(0) - # print(pred_depths.shape) # torch.Size([1, 1, 2160, 3840]) - # print(depths_gt.shape) # torch.Size([1, 1, 2160, 3840]) - - with 
amp.autocast(enabled=self.config.use_amp): - if self.sampled_training: - l_depth = self.silog_loss( - pred_depths, depths_gt, mask=mask.to(torch.bool)) - else: - l_depth = self.dense_silog_loss( - pred_depths, depths_gt, mask=mask.to(torch.bool), interpolate=True) - - - - metrics = compute_metrics(depths_gt, pred_depths, disp_gt_edges=disp_gt_edges, **self.config) - losses = {f"{self.silog_loss.name}": l_depth.item()} - - if self.should_log and self.config.get("debug", False): - print(metrics) - if val_step in [21, 27] and self.should_log: - if self.config.get("debug", False): - pass - else: - if self.sec_stage: - log_rgb = image_raw - else: - log_rgb = images - - scale_pred = nn.functional.interpolate( - pred_depths[0:1], depths_gt.shape[-2:], mode='bilinear', align_corners=True)[0] - depths_gt[torch.logical_not(mask)] = DATASETS_CONFIG[dataset]['max_depth'] - self.log_images(rgb={"Input": log_rgb[0]}, depth={"GT": depths_gt[0], "PredictedMono": scale_pred}, prefix="Test", - min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) - - return metrics, losses - - - - def train(self): - print(f"Training {self.config.name}") - if self.config.uid is None: - self.config.uid = str(uuid.uuid4()).split('-')[-1] - run_id = f"{dt.now().strftime('%d-%h_%H-%M')}-{self.config.uid}" - self.config.run_id = run_id - self.config.experiment_id = f"{self.config.wandb_start}_{self.config.name}{self.config.version_name}_{run_id}" - self.should_write = ((not self.config.distributed) - or self.config.rank == 0) - self.should_log = self.should_write # and logging - if self.should_log: - if self.config.get("debug", False): - pass - else: - tags = self.config.tags.split( - ',') if self.config.tags != '' else None - wandb.init(project=self.config.project, name=self.config.experiment_id, config=flatten(self.config), dir=self.config.root, - tags=tags, notes=self.config.notes, settings=wandb.Settings(start_method="fork")) - - self.model.train() - self.step = 0 - best_loss = np.inf - validate_every = int(self.config.validate_every * self.iters_per_epoch) - - - if self.config.prefetch: - - for i, batch in tqdm(enumerate(self.train_loader), desc=f"Prefetching...", - total=self.iters_per_epoch) if is_rank_zero(self.config) else enumerate(self.train_loader): - pass - - losses = {} - def stringify_losses(L): return "; ".join(map( - lambda kv: f"{colors.fg.purple}{kv[0]}{colors.reset}: {round(kv[1].item(),3):.4e}", L.items())) - - epoch_len = len(self.train_loader) - total_step = epoch_len * self.config.epochs - - for epoch in range(self.config.epochs): - if self.should_early_stop(): - break - - self.epoch = epoch - # self.save_checkpoint(f"{self.config.experiment_id}_latest.pt") # debug - ################################# Train loop ########################################################## - if self.should_log: - if self.config.get("debug", False): - pass - else: - wandb.log({"Epoch": epoch}, step=self.step) - pbar = tqdm(enumerate(self.train_loader), desc=f"Epoch: {epoch + 1}/{self.config.epochs}. 
Loop: Train", - total=self.iters_per_epoch) if is_rank_zero(self.config) else enumerate(self.train_loader) - - # 1532146.125 - for i, batch in pbar: - current_step = epoch_len * epoch + i - step_rate = current_step / total_step - # metrics, test_losses = self.validate() - # print(metrics) - if self.should_early_stop(): - print("Early stopping") - break - # print(f"Batch {self.step+1} on rank {self.config.rank}") - losses = self.train_on_batch(batch, i, step_rate) - # print(f"trained batch {self.step+1} on rank {self.config.rank}") - if self.config.get("debug", False): - log_info = "" - for name, loss in losses.items(): - log_info += "{}: {}, ".format(name, loss) - print(log_info) - - if self.boostingdepth: - for k,v in losses.items(): - losses[k] = torch.tensor(v) - - self.raise_if_nan(losses) - if is_rank_zero(self.config) and self.config.print_losses: - pbar.set_description( - f"Epoch: {epoch + 1}/{self.config.epochs}. Loop: Train. Losses: {stringify_losses(losses)}") - self.scheduler.step() - - if self.should_log and self.step % 50 == 0: - if self.config.get("debug", False): - log_info = "" - for name, loss in losses.items(): - log_info += "{}: {}, ".format(name, loss) - print(log_info) - else: - wandb.log({f"Train/{name}": loss.item() - for name, loss in losses.items()}, step=self.step) - # current_lr = self.optimizer.param_groups[0]['lr'] - current_lr = self.scheduler.get_last_lr()[0] - wandb.log({f"Train/LR": current_lr}, step=self.step) - momentum = self.optimizer.param_groups[0]['betas'][0] - wandb.log({f"Train/momentum": momentum}, step=self.step) - wandb.log({f"Train/step_rate": step_rate}, step=self.step) - - self.step += 1 - - ######################################################################################################## - - if self.test_loader: - if (self.step % validate_every) == 0: - self.model.eval() - if self.should_write: - self.save_checkpoint( - f"{self.config.experiment_id}_latest.pt") - - ################################# Validation loop ################################################## - # validate on the entire validation set in every process but save only from rank 0, I know, inefficient, but avoids divergence of processes - metrics, test_losses = self.validate() - # print("Validated: {}".format(metrics)) - if self.should_log: - if self.config.get("debug", False): - log_info = "" - for name, loss in test_losses.items(): - log_info += "{}: {}, ".format(name, loss) - log_info = "\n" - for name, val in metrics.items(): - log_info += "{}: {}, ".format(name, val) - print(log_info) - - else: - wandb.log( - {f"Test/{name}": tloss for name, tloss in test_losses.items()}, step=self.step) - - wandb.log({f"Metrics/{k}": v for k, - v in metrics.items()}, step=self.step) - - if (metrics[self.metric_criterion] < best_loss) and self.should_write: - self.save_checkpoint( - f"{self.config.experiment_id}_best.pt") - best_loss = metrics[self.metric_criterion] - - self.model.train() - - if self.config.distributed: - dist.barrier() - # print(f"Validated: {metrics} on device {self.config.rank}") - - # print(f"Finished step {self.step} on device {self.config.rank}") - ################################################################################################# - - # Save / validate at the end - self.step += 1 # log as final point - self.model.eval() - self.save_checkpoint(f"{self.config.experiment_id}_latest.pt") - if self.test_loader: - - ################################# Validation loop ################################################## - metrics, test_losses = 
self.validate() - # print("Validated: {}".format(metrics)) - if self.should_log: - if self.config.get("debug", False): - log_info = "" - for name, loss in test_losses.items(): - log_info += "{}: {}, ".format(name, loss) - log_info = "\n" - for name, val in metrics.items(): - log_info += "{}: {}, ".format(name, val) - print(log_info) - - else: - wandb.log({f"Test/{name}": tloss for name, - tloss in test_losses.items()}, step=self.step) - wandb.log({f"Metrics/{k}": v for k, - v in metrics.items()}, step=self.step) - - if (metrics[self.metric_criterion] < best_loss) and self.should_write: - self.save_checkpoint( - f"{self.config.experiment_id}_best.pt") - best_loss = metrics[self.metric_criterion] - - self.model.train() - - - def init_optimizer(self): - m = self.model.module if self.config.multigpu else self.model - - if self.config.same_lr: - print("Using same LR") - if hasattr(m, 'core'): - m.core.unfreeze() - params = self.model.parameters() - else: - print("Using diff LR") - if not hasattr(m, 'get_lr_params'): - raise NotImplementedError( - f"Model {m.__class__.__name__} does not implement get_lr_params. Please implement it or use the same LR for all parameters.") - - params = m.get_lr_params(self.config.lr) - - # if self.addf: - # return optim.Adam(params, lr=self.config.lr, betas=(0.5, 0.999)) - # else: - # return optim.AdamW(params, lr=self.config.lr, weight_decay=self.config.wd) - - return optim.AdamW(params, lr=self.config.lr, weight_decay=self.config.wd) - - - - def save_checkpoint(self, filename): - if not self.should_write: - return - root = self.config.save_dir - if not os.path.isdir(root): - os.makedirs(root) - - fpath = os.path.join(root, filename) - m = self.model.module if self.config.multigpu else self.model - torch.save( - { - "model": m.state_dict(), - "optimizer": None, # TODO : Change to self.optimizer.state_dict() if resume support is needed, currently None to reduce file size - "epoch": self.epoch - }, fpath) - - if self.boostingdepth: - fpath = os.path.join(root, "_fusion" + filename) - m.fusion_network.save_networks(fpath) diff --git a/zoedepth/trainers/zoedepth_nk_trainer.py b/zoedepth/trainers/zoedepth_nk_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d528ae126f1c51b2f25fd31f94a39591ceb2f43a --- /dev/null +++ b/zoedepth/trainers/zoedepth_nk_trainer.py @@ -0,0 +1,143 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
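Both the deleted custom trainer above and the NK trainer added below drive the optimizer with the same torch.cuda.amp pattern: backward on the scaled loss, unscale before clipping so the threshold applies to true gradient magnitudes, then step, update the scaler, and zero the gradients. A minimal sketch of one such update, with the model, optimizer, scaler, and clip value treated as placeholders supplied by the caller:

import torch.nn as nn

def amp_update(model, optimizer, scaler, loss, clip_grad=0.1):
    # One mixed-precision optimizer step with optional gradient clipping,
    # in the same order used by the trainers in this diff.
    scaler.scale(loss).backward()
    if clip_grad > 0:
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

Here scaler is a torch.cuda.amp.GradScaler created once per run, as in both trainers' __init__.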
+ +# File author: Shariq Farooq Bhat + +import torch +import torch.cuda.amp as amp +import torch.nn as nn + +from zoedepth.trainers.loss import GradL1Loss, SILogLoss +from zoedepth.utils.config import DATASETS_CONFIG +from zoedepth.utils.misc import compute_metrics + +from .base_trainer import BaseTrainer + + +class Trainer(BaseTrainer): + def __init__(self, config, model, train_loader, test_loader=None, device=None): + super().__init__(config, model, train_loader, + test_loader=test_loader, device=device) + self.device = device + self.silog_loss = SILogLoss() + self.grad_loss = GradL1Loss() + self.domain_classifier_loss = nn.CrossEntropyLoss() + + self.scaler = amp.GradScaler(enabled=self.config.use_amp) + + def train_on_batch(self, batch, train_step): + """ + Expects a batch of images and depth as input + batch["image"].shape : batch_size, c, h, w + batch["depth"].shape : batch_size, 1, h, w + + Assumes all images in a batch are from the same dataset + """ + + images, depths_gt = batch['image'].to( + self.device), batch['depth'].to(self.device) + # batch['dataset'] is a tensor strings all valued either 'nyu' or 'kitti'. labels nyu -> 0, kitti -> 1 + dataset = batch['dataset'][0] + # Convert to 0s or 1s + domain_labels = torch.Tensor([dataset == 'kitti' for _ in range( + images.size(0))]).to(torch.long).to(self.device) + + # m = self.model.module if self.config.multigpu else self.model + + b, c, h, w = images.size() + mask = batch["mask"].to(self.device).to(torch.bool) + + losses = {} + + with amp.autocast(enabled=self.config.use_amp): + output = self.model(images) + pred_depths = output['metric_depth'] + domain_logits = output['domain_logits'] + + l_si, pred = self.silog_loss( + pred_depths, depths_gt, mask=mask, interpolate=True, return_interpolated=True) + loss = self.config.w_si * l_si + losses[self.silog_loss.name] = l_si + + if self.config.w_grad > 0: + l_grad = self.grad_loss(pred, depths_gt, mask=mask) + loss = loss + self.config.w_grad * l_grad + losses[self.grad_loss.name] = l_grad + else: + l_grad = torch.Tensor([0]) + + if self.config.w_domain > 0: + l_domain = self.domain_classifier_loss( + domain_logits, domain_labels) + loss = loss + self.config.w_domain * l_domain + losses["DomainLoss"] = l_domain + else: + l_domain = torch.Tensor([0.]) + + self.scaler.scale(loss).backward() + + if self.config.clip_grad > 0: + self.scaler.unscale_(self.optimizer) + nn.utils.clip_grad_norm_( + self.model.parameters(), self.config.clip_grad) + + self.scaler.step(self.optimizer) + + if self.should_log and self.step > 1 and (self.step % int(self.config.log_images_every * self.iters_per_epoch)) == 0: + depths_gt[torch.logical_not(mask)] = -99 + self.log_images(rgb={"Input": images[0, ...]}, depth={"GT": depths_gt[0], "PredictedMono": pred[0]}, prefix="Train", + min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) + + self.scaler.update() + self.optimizer.zero_grad(set_to_none=True) + + return losses + + def validate_on_batch(self, batch, val_step): + images = batch['image'].to(self.device) + depths_gt = batch['depth'].to(self.device) + dataset = batch['dataset'][0] + if 'has_valid_depth' in batch: + if not batch['has_valid_depth']: + return None, None + + depths_gt = depths_gt.squeeze().unsqueeze(0).unsqueeze(0) + with amp.autocast(enabled=self.config.use_amp): + m = self.model.module if self.config.multigpu else self.model + pred_depths = m(images)["metric_depth"] + pred_depths = pred_depths.squeeze().unsqueeze(0).unsqueeze(0) + + mask = 
torch.logical_and( + depths_gt > self.config.min_depth, depths_gt < self.config.max_depth) + with amp.autocast(enabled=self.config.use_amp): + l_depth = self.silog_loss( + pred_depths, depths_gt, mask=mask.to(torch.bool), interpolate=True) + + metrics = compute_metrics(depths_gt, pred_depths, **self.config) + losses = {f"{self.silog_loss.name}": l_depth.item()} + + if val_step == 1 and self.should_log: + depths_gt[torch.logical_not(mask)] = -99 + self.log_images(rgb={"Input": images[0]}, depth={"GT": depths_gt[0], "PredictedMono": pred_depths[0]}, prefix="Test", + min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) + + return metrics, losses diff --git a/zoedepth/trainers/zoedepth_trainer.py b/zoedepth/trainers/zoedepth_trainer.py index 5fa5de7de1cc1ec388d5e3ee5b80b93ae3b56a1f..3ac1c24c0512c1c1b191670a7c24abb4fca47ba1 100644 --- a/zoedepth/trainers/zoedepth_trainer.py +++ b/zoedepth/trainers/zoedepth_trainer.py @@ -22,8 +22,6 @@ # File author: Shariq Farooq Bhat -# This file may include modifications from author Zhenyu Li - import torch import torch.cuda.amp as amp import torch.nn as nn @@ -64,6 +62,7 @@ class Trainer(BaseTrainer): losses = {} with amp.autocast(enabled=self.config.use_amp): + output = self.model(images) pred_depths = output['metric_depth'] @@ -151,8 +150,6 @@ class Trainer(BaseTrainer): depths_gt = batch['depth'].to(self.device) dataset = batch['dataset'][0] mask = batch["mask"].to(self.device) - disp_gt_edges = batch['disp_gt_edges'].squeeze().numpy() - if 'has_valid_depth' in batch: if not batch['has_valid_depth']: return None, None @@ -169,13 +166,12 @@ class Trainer(BaseTrainer): l_depth = self.silog_loss( pred_depths, depths_gt, mask=mask.to(torch.bool), interpolate=True) - metrics = compute_metrics(depths_gt, pred_depths, disp_gt_edges=disp_gt_edges, **self.config) + metrics = compute_metrics(depths_gt, pred_depths, **self.config) losses = {f"{self.silog_loss.name}": l_depth.item()} - if self.should_log: - if val_step in [1, 21, 41, 61]: - depths_gt[torch.logical_not(mask)] = DATASETS_CONFIG[dataset]['max_depth'] - self.log_images(rgb={"Input": images[0]}, depth={"GT": depths_gt[0], "PredictedMono": pred_depths[0]}, prefix="Test", - min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) + if val_step == 1 and self.should_log: + depths_gt[torch.logical_not(mask)] = -99 + self.log_images(rgb={"Input": images[0]}, depth={"GT": depths_gt[0], "PredictedMono": pred_depths[0]}, prefix="Test", + min_depth=DATASETS_CONFIG[dataset]['min_depth'], max_depth=DATASETS_CONFIG[dataset]['max_depth']) return metrics, losses diff --git a/zoedepth/utils/align/depth_alignment.py b/zoedepth/utils/align/depth_alignment.py deleted file mode 100644 index 939eca0376eebe06f6c7f03756dfe2c4c82d89f1..0000000000000000000000000000000000000000 --- a/zoedepth/utils/align/depth_alignment.py +++ /dev/null @@ -1,374 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all 
-# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# File author: Shariq Farooq Bhat, Zhenyu Li - -import torch -import numpy as np -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from tqdm.auto import tqdm -from torchvision.transforms import ToTensor, ToPILImage -from typing import List, Tuple -from PIL import Image -# from models.monodepth.zoedepth import ZoeDepthLora -# from zoedepth.utils.align.loss import SILogLoss, gradl1_loss, edge_aware_smoothness_per_pixel, ssim_loss -from .loss import * -import cv2 -from zoedepth.trainers.loss import * -# from utils.misc import * - - - -@torch.no_grad() -def scale_shift_linear(rendered_depth, predicted_depth, mask, fuse=True, return_params=False): - """ - Optimize a scale and shift parameter in the least squares sense, such that rendered_depth and predicted_depth match. - Formally, solves the following objective: - - min || (d * a + b) - d_hat || - a, b - - where d = 1 / predicted_depth, d_hat = 1 / rendered_depth - - :param rendered_depth: torch.Tensor (H, W) - :param predicted_depth: torch.Tensor (H, W) - :param mask: torch.Tensor (H, W) - 1: valid points of rendered_depth, 0: invalid points of rendered_depth (ignore) - :param fuse: whether to fuse shifted/scaled predicted_depth with the rendered_depth - - :return: scale/shift corrected depth - """ - if mask.sum() == 0: - return predicted_depth - - # rendered_disparity = 1 / rendered_depth[mask].unsqueeze(-1) - # predicted_disparity = 1 / predicted_depth[mask].unsqueeze(-1) - - rendered_disparity = rendered_depth[mask].unsqueeze(-1) - predicted_disparity = predicted_depth[mask].unsqueeze(-1) - - X = torch.cat([predicted_disparity, torch.ones_like(predicted_disparity)], dim=1) - XTX_inv = (X.T @ X).inverse() - XTY = X.T @ rendered_disparity - AB = XTX_inv @ XTY - - if return_params: - return AB - - fixed_disparity = (predicted_depth) * AB[0] + AB[1] - fixed_depth = fixed_disparity - - if fuse: - fused_depth = torch.where(mask, rendered_depth, fixed_depth) - return fused_depth - else: - return fixed_depth - - -def np_scale_shift_linear(rendered_depth: np.ndarray, predicted_depth: np.ndarray, mask: np.ndarray, fuse: bool=True): - """ - Optimize a scale and shift parameter in the least squares sense, such that rendered_depth and predicted_depth match. 
- Formally, solves the following objective: - - min || (d * a + b) - d_hat || - a, b - - where d = predicted_depth, d_hat = rendered_depth - - :param rendered_depth: np.ndarray (H, W) - :param predicted_depth: np.ndarray (H, W) - :param mask: np.ndarray (H, W) - 1: valid points of rendered_depth, 0: invalid points of rendered_depth (ignore) - :param fuse: whether to fuse shifted/scaled predicted_depth with the rendered_depth - - :return: scale/shift corrected depth - """ - if mask.sum() == 0: - return predicted_depth - - # rendered_disparity = 1 / rendered_depth[mask].reshape(-1, 1) - # predicted_disparity = 1 / predicted_depth[mask].reshape(-1, 1) - - rendered_disparity = rendered_depth[mask].reshape(-1, 1) - predicted_disparity = predicted_depth[mask].reshape(-1, 1) - - X = np.concatenate([predicted_disparity, np.ones_like(predicted_disparity)], axis=1) - XTX_inv = np.linalg.inv(X.T @ X) - XTY = X.T @ rendered_disparity - AB = XTX_inv @ XTY - - fixed_disparity = (predicted_depth) * AB[0] + AB[1] - fixed_depth = fixed_disparity - - if fuse: - fused_depth = np.where(mask, rendered_depth, fixed_depth) - return fused_depth - else: - return fixed_depth - - -@torch.no_grad() -def apply_depth_smoothing(depth, mask): - - def dilate(x, k=3): - x = as_bchw_tensor(x.float(), 1) - x = torch.nn.functional.conv2d(x.float(), - torch.ones(1, 1, k, k).to(x.device), - padding="same" - ) - return x.squeeze() > 0 - - def sobel(x): - flipped_sobel_x = torch.tensor([ - [-1, 0, 1], - [-2, 0, 2], - [-1, 0, 1] - ], dtype=torch.float32).to(x.device) - flipped_sobel_x = torch.stack([flipped_sobel_x, flipped_sobel_x.t()]).unsqueeze(1) - - x_pad = torch.nn.functional.pad(x.float(), (1, 1, 1, 1), mode="replicate") - - x = torch.nn.functional.conv2d( - x_pad, - flipped_sobel_x, - padding="valid" - ) - dx, dy = x.unbind(dim=-3) - # return torch.sqrt(dx**2 + dy**2).squeeze() - # new content is created mostly in x direction, sharp edges in y direction are wanted (e.g. 
table --> wall) - return dx - - depth = as_bchw_tensor(depth, 1) - mask = as_bchw_tensor(mask, 1).float() - - edges = sobel(mask) - dilated_edges = dilate(edges, k=21) - - depth_numpy = depth.squeeze().float().cpu().numpy() - blur_bilateral = cv2.bilateralFilter(depth_numpy, 5, 140, 140) - blur_gaussian = cv2.GaussianBlur(blur_bilateral, (5, 5), 0) - blur_gaussian = torch.from_numpy(blur_gaussian).to(depth) - # print("blur_gaussian", blur_gaussian.shape) - # plt.imshow(blur_gaussian.cpu().squeeze().numpy()) - # plt.title("depth smoothed whole") - # plt.show() - depth_smooth = torch.where(dilated_edges, blur_gaussian, depth) - return depth_smooth - -def get_dilated_only_mask(mask: torch.Tensor, k=7): - x = as_bchw_tensor(mask.float(), 1) - x = torch.nn.functional.conv2d(x, torch.ones(1, 1, k, k).to(mask.device),padding="same") - dilated = x.squeeze() > 0 - dilated_only = dilated ^ mask - return dilated_only - -def get_boundary_mask(mask: torch.Tensor, k=7): - return get_dilated_only_mask(mask, k=k) | get_dilated_only_mask(~mask, k=k) - - -@torch.no_grad() -def ss_align_and_blur(rendered_depth: torch.Tensor, predicted_depth: torch.Tensor, mask: torch.Tensor, fuse: bool=True): - aligned = scale_shift_linear(rendered_depth, predicted_depth, mask, fuse=fuse) - aligned = apply_depth_smoothing(aligned, mask) - return aligned - - -def np_ss_align_and_blur(rendered_depth: np.ndarray, predicted_depth: np.ndarray, mask: np.ndarray, fuse: bool=True): - aligned = np_scale_shift_linear(rendered_depth, predicted_depth, mask, fuse=fuse) - aligned = apply_depth_smoothing(aligned, mask).cpu().numpy() - return aligned - - - -def stitch(depth_src: torch.Tensor, depth_target: torch.Tensor, mask_src: torch.Tensor, smoothen=True, device='cuda:0'): - depth_src = as_bchw_tensor(depth_src, 1, device=device) - depth_target = as_bchw_tensor(depth_target, 1, device=device) - mask_src = as_bchw_tensor(mask_src, 1, device=device) - - stitched = depth_src * mask_src.float() + depth_target * (~mask_src).float() - # plt.imshow(stitched.cpu().squeeze().numpy()) - # plt.title("stitched before smoothing") - # plt.show() - # apply smoothing - if smoothen: - stitched = apply_depth_smoothing(stitched, mask_src).squeeze().float() - return stitched - - - - - - - - - -def smoothness_loss(depth, mask=None): - depth_grad_x = torch.abs(depth[:, :, :, :-1] - depth[:, :, :, 1:]) - depth_grad_y = torch.abs(depth[:, :, :-1, :] - depth[:, :, 1:, :]) - if mask is not None: - return torch.mean(depth_grad_x[mask[:, :, :, :-1]]) + torch.mean(depth_grad_y[mask[:, :, :-1, :]]) - return torch.mean(depth_grad_x) + torch.mean(depth_grad_y) - -import torch.optim as optim -from torch.optim import lr_scheduler -def finetune_on_sample(model, image_pil, target_depth, mask=None, - iters=10, lr=0.1, beta=0.5, w_boundary_grad=1, w_grad=0.1, gamma=0.99): - model.train() - model_device = next(model.parameters()).device - x = as_bchw_tensor(image_pil, 3, device=model_device) - target_depth = as_bchw_tensor(target_depth, 1, device=model_device) - if mask is None: - mask = target_depth > 0 - elif (not isinstance(mask, torch.Tensor)) or mask.shape != target_depth.shape: - mask = as_bchw_tensor(mask, 1, device=model_device).to(torch.bool) - - - history = [] - optimizer = torch.optim.Adam(model.parameters(), lr=lr) - # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma) - scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=iters, epochs=1) - # main_loss = nn.L1Loss() - main_loss = SILogLoss(beta=beta) - - orig_y = 
model.infer(x, with_flip_aug=False).detach() - - # scale, shift = scale_shift_linear(target_depth, orig_y, mask, return_params=True) - - gl1 = gradl1_loss - pbar = tqdm(range(iters), desc="Finetuning on sample") - for i in pbar: - optimizer.zero_grad() - y = model.infer(x, with_flip_aug=False) - # y = y * scale + shift - stitched = y * (~mask).float() + (target_depth * (mask).float()).detach() - # loss = F.mse_loss(y[mask], target_depth[mask]) - loss_si = main_loss(y[mask], target_depth[mask]) - # loss = loss_si \ - # + wgrad * ( gl1(y, stitched) \ - # + 2*gl1(y, orig_y) ) \ - # + wboundary_smoothness * smoothness_loss(y, mask=get_boundary_mask(mask)) - loss_grad = gl1(y, orig_y) - - bmask = get_boundary_mask(mask) - loss_boundary_grad = laplacian_matching_loss(stitched, orig_y, bmask) - loss = loss_si + w_boundary_grad * loss_boundary_grad + w_grad * loss_grad - - # check if loss is nan - if torch.isnan(loss): - print("Loss is nan, breaking") - break - loss.backward() - - optimizer.step() - scheduler.step() - # history.append(loss.item()) - - pbar.set_postfix(loss=loss.item(), si=loss_si.item()) - model.eval() - return model, history - -# def align_by_finetuning_lora(model: ZoeDepthLora, image, target_depth, mask=None, iters=10, lr=0.1, gamma=0.99, **kwargs): -# # model.reset_lora() -# model.set_only_lora_trainable() -# model, history = finetune_on_sample(model, image, target_depth, mask=mask, iters=iters, lr=lr, gamma=gamma) -# aligned_depth = model.infer(as_bchw_tensor(image, 3, device=next(model.parameters()).device)) -# return dict(model=model, history=history, aligned_depth=aligned_depth) - - - - - - -import torch.nn as nn -import torch.nn.functional as F -# from utils.misc import as_bchw_tensor - -def as_bchw_tensor(input_tensor, num, device): - input_tensor = torch.tensor(input_tensor).unsqueeze(dim=0).unsqueeze(dim=0).cuda() - return input_tensor - -def optimize_depth_deformation(rendered_depth, pred_depth, mask, h=10, w=10, iters=100, init_lr=0.1, gamma=0.996, - init_deformation=None, - device='cuda:0'): - rendered_depth = as_bchw_tensor(rendered_depth, 1, device=device) - pred_depth = as_bchw_tensor(pred_depth, 1, device=device) - mask = as_bchw_tensor(mask, 1, device=device).to(torch.bool) - # initialize a grid of scalar values (with zeros) that will be optimized - # to deform the depth map - if init_deformation is None: - deformation = torch.zeros((1,1,h,w), requires_grad=True, device=device) - else: - deformation = init_deformation - deformation.requires_grad = True - assert deformation.shape == (1,1,h,w) - - optimizer = torch.optim.Adam([deformation], lr=init_lr) - # exponential LR schedule - scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma) - # optimize the deformation - history = [] - grad_loss = GradL1Loss() - for i in tqdm(range(iters)): - scalar_deformation = torch.exp(deformation) - scalar_deformation = F.interpolate(scalar_deformation, size=pred_depth.shape[-2:], mode='bilinear', align_corners=True) - adjusted_depth = pred_depth * scalar_deformation - loss = F.mse_loss(adjusted_depth[mask], rendered_depth[mask], reduction='none') - loss_g = grad_loss(adjusted_depth, rendered_depth, mask) - loss = loss.mean() + 0.1*loss_g - # loss = loss.mean() - optimizer.zero_grad() - loss.backward() - optimizer.step() - scheduler.step() - if i % 10 == 0: - history.append(loss.item()) - - scalar_deformation = torch.exp(deformation) - scalar_deformation = F.interpolate(scalar_deformation, size=pred_depth.shape[-2:], mode='bilinear', align_corners=True) - 
adjusted_depth = pred_depth * scalar_deformation - # return dict(aligned_depth=adjusted_depth.detach().cpu().numpy().squeeze(), - # history=history, - # deformation=deformation) - return adjusted_depth.detach().cpu().squeeze() - -def stage_wise_optimization(rendered_depth, pred_depth, mask, - stages=[(4,4), (8,8), (16,16), (32,32)], - iters=100, init_lr=0.1, gamma=0.996, device='cuda:1'): - - h_init, w_init = stages[0] - init_deformation = torch.zeros((1,1,h_init,w_init), device=device) - result = optimize_depth_deformation(rendered_depth, pred_depth, mask, h=h_init, w=w_init, iters=iters, init_lr=init_lr, gamma=gamma, init_deformation=init_deformation, device=device) - init_deformation = result['deformation'] - history_stages = [result['history']] - for h, w in stages[1:]: - init_deformation = F.interpolate(init_deformation, size=(h,w), mode='bilinear', align_corners=True).detach() - result = optimize_depth_deformation(rendered_depth, pred_depth, mask, h=h, w=w, iters=iters, init_lr=init_lr, gamma=gamma, init_deformation=init_deformation, device=device) - init_deformation = result['deformation'] - history_stages.append(result['history']) - init_lr *= gamma**2 - - return dict(aligned_depth=result['aligned_depth'], history_stages=history_stages) - - - - - diff --git a/zoedepth/utils/align/loss.py b/zoedepth/utils/align/loss.py deleted file mode 100644 index cdca588d7abb8417a43c28c850fa7c6e58a220ac..0000000000000000000000000000000000000000 --- a/zoedepth/utils/align/loss.py +++ /dev/null @@ -1,178 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
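The optimize_depth_deformation() and stage_wise_optimization() routines removed above align a predicted depth map to a reference by optimising a low-resolution grid of log-scale factors that is bilinearly upsampled and multiplied onto the prediction. A self-contained sketch of that idea follows; the function name, grid size, iteration count and learning rate are illustrative, and the original's extra GradL1 term and exponential LR schedule are omitted for brevity:

import torch
import torch.nn.functional as F

def align_by_deformation(reference, predicted, mask, grid_hw=(8, 8), iters=200, lr=0.1):
    # reference, predicted: (1, 1, H, W) float tensors; mask: (1, 1, H, W) bool tensor
    deformation = torch.zeros((1, 1, *grid_hw), requires_grad=True)
    optimizer = torch.optim.Adam([deformation], lr=lr)
    for _ in range(iters):
        # exp() keeps the per-pixel scale positive; upsample the coarse grid to full resolution
        scale = torch.exp(F.interpolate(deformation, size=predicted.shape[-2:],
                                        mode="bilinear", align_corners=True))
        loss = F.mse_loss((predicted * scale)[mask], reference[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    with torch.no_grad():
        scale = torch.exp(F.interpolate(deformation, size=predicted.shape[-2:],
                                        mode="bilinear", align_corners=True))
        return predicted * scale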
- -# File author: Shariq Farooq Bhat, Zhenyu Li - -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np - -def gaussian(window_size, sigma): - gauss = torch.Tensor([np.exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)]) - return gauss/gauss.sum() - -def create_window(window_size, channel=1): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) - window = _2D_window.expand(channel, 1, window_size, window_size).contiguous() - return window - -def ssim(img1, img2, val_range, window_size=11, window=None, size_average=True, full=False): - img1 = nn.functional.interpolate(img1, (256, 256), mode='bilinear', align_corners=True) - img2 = nn.functional.interpolate(img2, (256, 256), mode='bilinear', align_corners=True) - # h, w = 256, 256 - L = val_range - - padd = 0 - (_, channel, height, width) = img1.size() - if window is None: - real_size = min(window_size, height, width) - window = create_window(real_size, channel=channel).to(img1.device) - - mu1 = F.conv2d(img1, window, padding=padd, groups=channel) - mu2 = F.conv2d(img2, window, padding=padd, groups=channel) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d(img1 * img1, window, padding=padd, groups=channel) - mu1_sq - sigma2_sq = F.conv2d(img2 * img2, window, padding=padd, groups=channel) - mu2_sq - sigma12 = F.conv2d(img1 * img2, window, padding=padd, groups=channel) - mu1_mu2 - - C1 = (0.01 * L) ** 2 - C2 = (0.03 * L) ** 2 - - v1 = 2.0 * sigma12 + C2 - v2 = sigma1_sq + sigma2_sq + C2 - cs = torch.mean(v1 / v2) # contrast sensitivity - - ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2) - - if size_average: - ret = ssim_map.mean() - else: - ret = ssim_map.mean(1).mean(1).mean(1) - - if full: - return ret, cs - - return ret - - - -class SSIMLoss(nn.Module): - def __init__(self, min_depth=1e-3, max_depth=10): - super(SSIMLoss, self).__init__() - self.name = 'SSIM' - self.min_depth = min_depth - self.max_depth = max_depth - - def forward(self, input, target): - loss = torch.clamp((1 - ssim(input, target, val_range=self.max_depth/self.min_depth)) * 0.5, 0, 1) - return loss - - -# Main loss function used for ZoeDepth. 
Copy/paste from AdaBins repo (https://github.com/shariqfarooq123/AdaBins/blob/0952d91e9e762be310bb4cd055cbfe2448c0ce20/loss.py#L7) -class SILogLoss(nn.Module): - """SILog loss (pixel-wise)""" - def __init__(self, beta=0.15): - super().__init__() - self.name = 'SILog' - self.beta = beta - - def forward(self, input, target): - alpha = 1e-10 - g = torch.log(input + alpha) - torch.log(target + alpha) - - # n, c, h, w = g.shape - # norm = 1/(h*w) - # Dg = norm * torch.sum(g**2) - (0.85/(norm**2)) * (torch.sum(g))**2 - - Dg = torch.var(g) + self.beta * torch.pow(torch.mean(g), 2) - - loss = 10 * torch.sqrt(Dg) - return loss - -def gradient_y(img): - gy = torch.cat( [F.conv2d(img[:, i, :, :].unsqueeze(0), torch.Tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) for i in range(img.shape[1])], 1) - return gy - -def gradient_x(img): - gx = torch.cat( [F.conv2d(img[:, i, :, :].unsqueeze(0), torch.Tensor([[1, 0, -1], [2, 0, -2], [1, 0, -1]]).view((1, 1, 3, 3)).to(img.device), padding=1) for i in range(img.shape[1])], 1) - return gx - -def laplacian(img): - lap = torch.cat( [F.conv2d(img[:, i, :, :].unsqueeze(0), torch.Tensor([[0, 1, 0], [1, -4, 1], [0, 1, 0]]).view((1, 1, 3, 3)).to(img.device), padding=1) for i in range(img.shape[1])], 1) - return lap - -def laplacian_matching_loss(img1, img2, mask=None): - return torch.mean(torch.abs(laplacian(img1)[mask] - laplacian(img2)[mask])) - -class GradL1Loss(nn.Module): - def __init__(self): - super(GradL1Loss, self).__init__() - self.name = 'GradL1' - - def forward(self, input, target, mask=None): - - grad_gt_x = gradient_x(target) - grad_gt_y = gradient_y(target) - grad_pred_x = gradient_x(input) - grad_pred_y = gradient_y(input) - loss = torch.mean(torch.abs(grad_pred_x[mask] - grad_gt_x[mask])) + torch.mean(torch.abs(grad_pred_y[mask] - grad_gt_y[mask])) - return loss - -# Edge aware smoothness loss implementation is adapted from: https://github.com/anuragranj/cc -def edge_aware_smoothness_per_pixel(img, pred): - """ A measure of how closely the gradients of a predicted disparity/depth map match the - gradients of the RGB image. 
- - Args: - img (c x 3 x h x w tensor): RGB image - pred (c x h x w tensor): predicted depth/disparity - - Returns: - c x 1 tensor: measure of gradient matching (smoothness loss) - """ - - - - pred_gradients_x = gradient_x(pred) - pred_gradients_y = gradient_y(pred) - - image_gradients_x = gradient_x(img) - image_gradients_y = gradient_y(img) - - weights_x = torch.exp(-torch.mean(torch.abs(image_gradients_x), 1, keepdim=True)) - weights_y = torch.exp(-torch.mean(torch.abs(image_gradients_y), 1, keepdim=True)) - - smoothness_x = torch.abs(pred_gradients_x) * weights_x - smoothness_y = torch.abs(pred_gradients_y) * weights_y - - return torch.mean(smoothness_x) + torch.mean(smoothness_y) - - - -ssim_loss = SSIMLoss() -gradl1_loss = GradL1Loss() \ No newline at end of file diff --git a/zoedepth/utils/align/mlp_alignment.py b/zoedepth/utils/align/mlp_alignment.py deleted file mode 100644 index 2ea91df321e956883847cd912aef5bd7b4e934d6..0000000000000000000000000000000000000000 --- a/zoedepth/utils/align/mlp_alignment.py +++ /dev/null @@ -1,262 +0,0 @@ -# MIT License - -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
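The edge_aware_smoothness_per_pixel() loss deleted just above penalises depth gradients except where the RGB image itself has strong gradients, so sharp depth edges are only tolerated at image edges. The sketch below illustrates the same weighting with simple forward differences instead of the original Sobel kernels; the function name and toy shapes are assumptions made for the example:

import torch

def edge_aware_smoothness(img, depth):
    # img: (B, 3, H, W), depth: (B, 1, H, W)
    d_dx = (depth[..., :, 1:] - depth[..., :, :-1]).abs()
    d_dy = (depth[..., 1:, :] - depth[..., :-1, :]).abs()
    i_dx = (img[..., :, 1:] - img[..., :, :-1]).abs().mean(1, keepdim=True)
    i_dy = (img[..., 1:, :] - img[..., :-1, :]).abs().mean(1, keepdim=True)
    # exp(-|image gradient|) is ~1 on flat regions and ~0 at strong image edges,
    # so depth gradients are only penalised where the image is smooth.
    return (d_dx * torch.exp(-i_dx)).mean() + (d_dy * torch.exp(-i_dy)).mean()

loss = edge_aware_smoothness(torch.rand(1, 3, 32, 32), torch.rand(1, 1, 32, 32))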
- -# File author: Shariq Farooq Bhat, Zhenyu Li - -import torch -import torch.nn as nn -import torch.nn.functional as F -from tqdm.auto import tqdm -import torch.optim as optim -import torch.optim.lr_scheduler -from zoedepth.utils.align.loss import SILogLoss, gradl1_loss, edge_aware_smoothness_per_pixel -# from utils.misc import * -from .depth_alignment import apply_depth_smoothing, scale_shift_linear -import cv2 -import numpy as np - -def as_bchw_tensor(input_tensor, num, device=None): - if len(input_tensor.shape) == 2: - input_tensor = torch.tensor(input_tensor).unsqueeze(dim=0).unsqueeze(dim=0) - elif len(input_tensor.shape) == 3: - input_tensor = torch.tensor(input_tensor).unsqueeze(dim=0) - else: - input_tensor = input_tensor - if device is not None: - input_tensor = input_tensor.to(device) - return input_tensor - - -def get_mlp(in_channels, out_channels): - conv_config = dict(kernel_size=1, padding=0, stride=1) - net = nn.Sequential( - # nn.Conv2d(in_channels, 64, kernel_size=7, padding=3, stride=1), - # nn.GELU(), - nn.Conv2d(in_channels, 64, **conv_config), - nn.GELU(), - nn.Conv2d(64, 128, **conv_config), - nn.GELU(), - nn.Conv2d(128, out_channels, **conv_config), - ) - - # initialize last layer to predict zeroes - # net[-1].weight.data.zero_() - # net[-1].bias.data.zero_() - return net - -def smoothness_loss(depth): - depth_dx = depth[:, :, :-1, :-1] - depth[:, :, :-1, 1:] - depth_dy = depth[:, :, :-1, :-1] - depth[:, :, 1:, :-1] - depth_dx = depth_dx.abs().mean() - depth_dy = depth_dy.abs().mean() - return depth_dx + depth_dy - -def curvature_loss(depth): - depth_dx = depth[:, :, :-1, :-1] - depth[:, :, :-1, 1:] - depth_dy = depth[:, :, :-1, :-1] - depth[:, :, 1:, :-1] - depth_dxx = depth_dx[:, :, :, :-1] - depth_dx[:, :, :, 1:] - depth_dyy = depth_dy[:, :, :-1, :] - depth_dy[:, :, 1:, :] - depth_dxy = depth_dx[:, :, :-1, :-1] - depth_dx[:, :, 1:, 1:] - depth_dxx = depth_dxx.abs().mean() - depth_dyy = depth_dyy.abs().mean() - depth_dxy = depth_dxy.abs().mean() - return depth_dxx + depth_dyy + depth_dxy - -def multi_scale_curvature_loss(depth, scales=[1, 2, 4]): - loss = 0 - for s in scales: - loss += curvature_loss(F.interpolate(depth, scale_factor=1/s, mode='bilinear', align_corners=False)) - return loss - - -def tv_loss(x): - """Total variation loss.""" - b, c, h, w = x.shape - dh = torch.abs(x[:, :, 1:, :] - x[:, :, :-1, :]) - dw = torch.abs(x[:, :, :, 1:] - x[:, :, :, :-1]) - return torch.sum(dh) + torch.sum(dw) - -def scale_invariant_gradient_loss(pred, gt): - alpha = 1e-10 - kernel_grad_x = torch.Tensor([[1, 0, -1], [2, 0, -2], [1, 0, -1]]).view((1, 1, 3, 3)).to(pred.device) - kernel_grad_y = torch.Tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]).view((1, 1, 3, 3)).to(pred.device) - - g = torch.log(pred + alpha) - torch.log(gt + alpha) - g_x = F.conv2d(g, kernel_grad_x, padding=1) - g_y = F.conv2d(g, kernel_grad_y, padding=1) - # n, c, h, w = g.shape - # norm = 1/(h*w) - # Dg = norm * torch.sum(g**2) - (0.85/(norm**2)) * (torch.sum(g))**2 - - Dgx = torch.var(g_x) + 0.5 * torch.pow(torch.mean(g_x), 2) - Dgy = torch.var(g_y) + 0.5 * torch.pow(torch.mean(g_y), 2) - - - loss = 10 * torch.sqrt(Dgx) + 10 * torch.sqrt(Dgy) - return loss - - -def positionalencoding2d(d_model, height, width): - """ - :param d_model: dimension of the model - :param height: height of the positions - :param width: width of the positions - :return: d_model*height*width position matrix - """ - if d_model % 4 != 0: - raise ValueError("Cannot use sin/cos positional encoding with " - "odd dimension (got 
dim={:d})".format(d_model)) - pe = torch.zeros(d_model, height, width) - # Each dimension use half of d_model - d_model = int(d_model / 2) - div_term = torch.exp(torch.arange(0., d_model, 2) * - -(np.log(10000.0) / d_model)) - pos_w = torch.arange(0., width).unsqueeze(1) - pos_h = torch.arange(0., height).unsqueeze(1) - pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) - pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) - pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) - pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) - - return pe - -def gaussian_rff(d_model, height, width, sigma=10): - assert d_model % 2 == 0 - B = torch.randn((d_model//2, 2)) * sigma - - x = torch.linspace(-1, 1, width) - y = torch.linspace(-1, 1, height) - - x, y = torch.meshgrid(x, y) - xy = torch.stack([x, y], dim=-1).view(-1, 2) - - xy = torch.matmul(B, xy.T).T - xy = 2 * np.pi * xy.view(height, width, d_model//2) - enc = torch.cat([torch.sin(xy), torch.cos(xy)], dim=-1) - return enc.permute(2, 0, 1) - - -def get_depth(pred, D): - a, b, c = torch.split(pred, 1, dim=1) - return 1e-7 + torch.relu(torch.exp(a) * D + (torch.sigmoid(c)-0.5)*torch.exp(b)) - # return 1e-4 + torch.exp(a) * D + b - # return nn.Softplus()(a) - - -def train_mlp(image, mask, dr, dp, lr=3e-2, num_iters=3000, device='cuda:0', pos_dim=32, loss_config=dict(beta=0.99), - w_smooth=1, w_curvature=0.0, w_gl1=0.1, w_tv=0.1, w_shift_reg=0.1, **kwargs): - - mlp = get_mlp(pos_dim+4, 3) - # mlp = get_mlp(4, 3) - mlp = mlp.to(device) - - optimizer = optim.AdamW(mlp.parameters(), lr=lr) - scheduler = optim.lr_scheduler.OneCycleLR(optimizer, lr, epochs=num_iters, steps_per_epoch=1) - - image = as_bchw_tensor(image, 3, device=device).detach() - h, w = image.shape[-2:] - pe = positionalencoding2d(pos_dim, h, w) - pe = as_bchw_tensor(pe, pos_dim, device=device) - D = as_bchw_tensor(dp, 1, device=device).detach() - # pe = as_bchw_tensor(gaussian_rff(pos_dim, h, w, sigma=5), pos_dim, device=device) - X = torch.cat([image, D, pe], dim=1) # bchw - # X = torch.cat([image, D], dim=1) # bchw - - Y = as_bchw_tensor(dr, 1, device=device).detach() - mask = as_bchw_tensor(mask, 1, device=device).detach() - pbar = tqdm(range(num_iters), desc=f"Training") - # beta_min, beta_max = 0. 
- si_log = SILogLoss(**loss_config) - - for i in pbar: - optimizer.zero_grad() - # pred = dr.max().item() * torch.sigmoid(mlp(X)) - pred = mlp(X) - a, b, c = torch.split(pred, 1, dim=1) - pred = get_depth(pred, D.detach()) - loss_si = si_log(pred[mask], Y[mask]) - loss = loss_si + w_curvature * multi_scale_curvature_loss(pred) + w_gl1 * gradl1_loss(pred, D.detach()) + w_smooth * edge_aware_smoothness_per_pixel(image, pred) - # loss_tv = w_tv * (tv_loss(a) + tv_loss(b) + tv_loss(c)) - # loss_gl1 = w_gl1 * gradl1_loss(pred, D.detach()) - # loss_gl1 = w_gl1 * scale_invariant_gradient_loss(pred, D.detach()) - # loss_shift_reg = w_shift_reg * torch.mean(b**2) - # loss = loss_si + loss_gl1 - # loss = F.mse_loss(pred[mask], Y[mask]) - loss.backward() - optimizer.step() - scheduler.step() - pbar.set_postfix(loss=loss.item(), si=loss_si.item()) - - return mlp - -def predict_aligned(mlp, image, dp, pos_dim=32, **kwargs): - device = next(mlp.parameters()).device - image = as_bchw_tensor(image, 3, device=device) - h, w = image.shape[-2:] - pe = positionalencoding2d(pos_dim, h, w) - pe = as_bchw_tensor(pe, pos_dim, device=device) - D = as_bchw_tensor(dp, 1, device=device) - # pe = as_bchw_tensor(gaussian_rff(pos_dim, h, w, sigma=5), pos_dim, device=device) - - X = torch.cat([image, D, pe], dim=1) # bchw - # X = torch.cat([image, D], dim=1) # bchw - pred = mlp(X) - pred = get_depth(pred, D) - return pred.detach() - -def align_by_mlp(image, mask, dr, dp, **kwargs): - mlp = train_mlp(image, mask, dr, dp, **kwargs) - pred = predict_aligned(mlp, image, dp, **kwargs) - return pred - -from abc import ABC, abstractmethod - - -# Abstract class for depth alignment. All depth alignment methods should inherit from this class. -# The abstract class defines the interface for depth alignment. -class DepthAligner(ABC): - def __init__(self): - super().__init__() - - @abstractmethod - def align(self, depth_src, depth_target, valid_mask, *args, **kwargs): - """ - Aligns the depth_src to the depth_target such that the aligned depth_src is as close as possible to the depth_target. 
- """ - raise NotImplementedError - -class MLPAligner(DepthAligner): - def __init__(self): - super().__init__() - - def align(self, depth_src, depth_target, valid_mask, image, **kwargs): - depth_src = as_bchw_tensor(depth_src, 1) - depth_target = as_bchw_tensor(depth_target, 1) - valid_mask = as_bchw_tensor(valid_mask, 1) - depth_target = scale_shift_linear(depth_target, depth_src, valid_mask) - aligned = align_by_mlp(image, valid_mask, depth_target, depth_src, **kwargs) - - depth_numpy = aligned.squeeze().float().cpu().numpy() - blur_bilateral = cv2.bilateralFilter(depth_numpy, 5, 140, 140) - blur_gaussian = cv2.GaussianBlur(blur_bilateral, (5, 5), 0) - blur_gaussian = torch.from_numpy(blur_gaussian).to(aligned) - return blur_gaussian.unsqueeze(0) \ No newline at end of file diff --git a/zoedepth/utils/arg_utils.py b/zoedepth/utils/arg_utils.py index 8a2f3213c2f99c54c76886f6ffb4d9a71d363560..8a3004ec3679c0a40fd8961253733fb4343ad545 100644 --- a/zoedepth/utils/arg_utils.py +++ b/zoedepth/utils/arg_utils.py @@ -1,26 +1,4 @@ -# MIT License -# Copyright (c) 2022 Intelligent Systems Lab Org - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -# File author: Shariq Farooq Bhat def infer_type(x): # hacky way to infer type from string args if not isinstance(x, str): diff --git a/zoedepth/utils/config.py b/zoedepth/utils/config.py index b55b1c086e6d1b6711af254d77dffa0ad82d2ec3..363b0e186cd1247e8f5fc224e6f69f5fc8190c99 100644 --- a/zoedepth/utils/config.py +++ b/zoedepth/utils/config.py @@ -22,7 +22,7 @@ # File author: Shariq Farooq Bhat -import json5 +import json import os from zoedepth.utils.easydict import EasyDict as edict @@ -36,8 +36,7 @@ ROOT = pathlib.Path(__file__).parent.parent.resolve() HOME_DIR = os.path.expanduser("~") COMMON_CONFIG = { - # "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"), - "save_dir": "./nfs/monodepth3_checkpoints", + "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"), "project": "ZoeDepth", "tags": '', "notes": "", @@ -99,14 +98,14 @@ DATASETS_CONFIG = { "avoid_boundary": False, "min_depth": 1e-3, # originally 0.1 "max_depth": 10, - "data_path": os.path.join("/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder"), - "gt_path": os.path.join("/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder"), - "filenames_file": "/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder/nyu_train.txt", + "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"), + "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"), + "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt", "input_height": 480, "input_width": 640, - "data_path_eval": os.path.join("/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder"), - "gt_path_eval": os.path.join("/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder"), - "filenames_file_eval": "/ibex/ai/home/liz0l/codes/datasets/nyu/data_folder/nyu_test.txt", + "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"), + "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"), + "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt", "min_depth_eval": 1e-3, "max_depth_eval": 10, "min_depth_diff": -10, @@ -116,102 +115,7 @@ DATASETS_CONFIG = { "degree": 1.0, "do_kb_crop": False, "garg_crop": False, - "eigen_crop": False, - }, - "u4k": { - "dataset": "u4k", - "min_depth": 1e-3, # originally 0.1 - "max_depth": 80, - "data_path": os.path.join("/ibex/ai/home/liz0l/codes/datasets/u4k"), - "filenames_train": "/ibex/ai/home/liz0l/codes/datasets/u4k/splits/train.txt", - "input_height": 480, # ? will not be used (random crop) - "input_width": 640, # ? 
will not be used (random crop) - "filenames_val": "/ibex/ai/home/liz0l/codes/datasets/u4k/splits/val.txt", - # "filenames_val": "/ibex/ai/home/liz0l/codes/datasets/u4k/splits/test.txt", - "filenames_test": "/ibex/ai/home/liz0l/codes/datasets/u4k/splits/test.txt", - "min_depth_eval": 1e-3, - "max_depth_eval": 80, - "min_depth_diff": -10, - "max_depth_diff": 10, - - "do_random_rotate": True, - "degree": 1.0, - "do_kb_crop": False, - "garg_crop": False, - "eigen_crop": False, - - "num_sample_inout": 50000, - # "num_sample_inout": 40000, - "sampling_strategy": 'random', - # "sampling_strategy": 'dda', - "dilation_factor": 10, - - "use_rgb": False, - "do_normalize": True, # do normalize in dataloader - "do_input_resize": True - }, - "mid": { - "dataset": "mid", - "min_depth": 1e-3, # originally 0.1 - "max_depth": 10, - "data_path": os.path.join("/ibex/ai/home/liz0l/codes/datasets/middlebury"), - "filenames_train": "/ibex/ai/home/liz0l/codes/datasets/middlebury/splits/train.txt", - "input_height": 480, # ? will not be used (random crop) - "input_width": 640, # ? will not be used (random crop) - "filenames_val": "/ibex/ai/home/liz0l/codes/datasets/middlebury/splits/val.txt", - "filenames_test": "/ibex/ai/home/liz0l/codes/datasets/middlebury/splits/test.txt", - "min_depth_eval": 1e-3, - "max_depth_eval": 10, - "min_depth_diff": -10, - "max_depth_diff": 10, - - "do_random_rotate": True, - "degree": 1.0, - "do_kb_crop": False, - "garg_crop": False, - "eigen_crop": False, - - "num_sample_inout": 50000, - # "num_sample_inout": 40000, - "sampling_strategy": 'random', - # "sampling_strategy": 'dda', - "dilation_factor": 10, - - "use_rgb": False, - "do_normalize": True, # do normalize in dataloader - "do_input_resize": True - }, - "gta": { - "dataset": "gta", - "min_depth": 1e-3, # originally 0.1 - "max_depth": 80, - "data_path": os.path.join("/ibex/ai/home/liz0l/codes/datasets/gta/GTAV_1080"), - "filenames_train": "/ibex/ai/home/liz0l/codes/datasets/gta/GTAV_1080/train.txt", - "input_height": 480, # ? will not be used (random crop) - "input_width": 640, # ? will not be used (random crop) - "filenames_val": "/ibex/ai/home/liz0l/codes/datasets/gta/GTAV_1080/val.txt", - # "filenames_val": "/ibex/ai/home/liz0l/codes/datasets/u4k/splits/test.txt", - "filenames_test": "/ibex/ai/home/liz0l/codes/datasets/gta/GTAV_1080/test.txt", - "min_depth_eval": 1e-3, - "max_depth_eval": 80, - "min_depth_diff": -10, - "max_depth_diff": 10, - - "do_random_rotate": True, - "degree": 1.0, - "do_kb_crop": False, - "garg_crop": False, - "eigen_crop": False, - - "num_sample_inout": 50000, - # "num_sample_inout": 40000, - "sampling_strategy": 'random', - # "sampling_strategy": 'dda', - "dilation_factor": 10, - - "use_rgb": False, - "do_normalize": True, # do normalize in dataloader - "do_input_resize": True + "eigen_crop": True }, "ibims": { "dataset": "ibims", @@ -332,8 +236,7 @@ ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR COMMON_TRAINING_CONFIG = { "dataset": "nyu", "distributed": True, - "workers": 24, - # "workers": 6, + "workers": 16, "clip_grad": 0.1, "use_shared_dict": False, "shared_dict": None, @@ -345,11 +248,8 @@ COMMON_TRAINING_CONFIG = { "translate_prob": 0.2, "max_translation": 100, - "validate_every": 1, - # "validate_every": 1, - # "log_images_every": 0.01, - "log_images_every": 0.2, - # "log_images_every": 0.05, + "validate_every": 0.25, + "log_images_every": 0.1, "prefetch": False, } @@ -403,7 +303,7 @@ def parse_list(config, key, dtype=int): ), f"{key} should be a list of values dtype {dtype}. 
Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}." -def get_model_config(model_name, model_version=None, model_cfg_path=""): +def get_model_config(model_name, model_version=None): """Find and parse the .json config file for the model. Args: @@ -413,33 +313,26 @@ def get_model_config(model_name, model_version=None, model_cfg_path=""): Returns: easydict: the config dictionary for the model. """ - - if model_cfg_path != "": - config_file = model_cfg_path - else: - config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json" - config_file = os.path.join(ROOT, "models", model_name, config_fname) + config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json" + config_file = os.path.join(ROOT, "models", model_name, config_fname) if not os.path.exists(config_file): return None - - # with open(config_file, "r") as f: - # config = edict(json.load(f)) - # try to be more friendly - with open(config_file, 'r') as f: - config = edict(json5.load(f)) + + with open(config_file, "r") as f: + config = edict(json.load(f)) # handle dictionary inheritance # only training config is supported for inheritance if "inherit" in config.train and config.train.inherit is not None: - inherit_config = get_model_config(config.train["inherit"], model_cfg_path=model_cfg_path).train + inherit_config = get_model_config(config.train["inherit"]).train for key, value in inherit_config.items(): if key not in config.train: config.train[key] = value return edict(config) -def update_model_config(config, mode, model_name, model_version=None, strict=False, model_cfg_path=""): - model_config = get_model_config(model_name, model_version, model_cfg_path=model_cfg_path) +def update_model_config(config, mode, model_name, model_version=None, strict=False): + model_config = get_model_config(model_name, model_version) if model_config is not None: config = {**config, ** flatten({**model_config.model, **model_config[mode]})} @@ -458,7 +351,7 @@ KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "prefetch", "cycle_momentum"] # Casting is not necessary as their int casted values in config are 0 or 1 -def get_config(model_name, mode='train', dataset=None, model_cfg_path=None, **overwrite_kwargs): +def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs): """Main entry point to get the config for the model. Args: @@ -478,24 +371,24 @@ def get_config(model_name, mode='train', dataset=None, model_cfg_path=None, **ov easydict: The config dictionary for the model. 
""" - check_choices("Model", model_name, ["zoedepth", "zoedepth_nk", "zoedepth_custom"]) + + check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"]) check_choices("Mode", mode, ["train", "infer", "eval"]) if mode == "train": - check_choices("Dataset", dataset, ["nyu", "kitti", "mix", 'u4k', 'mid', 'gta', None]) + check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None]) - config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG}) - config = update_model_config(config, mode, model_name, model_cfg_path=model_cfg_path) + config = update_model_config(config, mode, model_name) # update with model version specific config version_name = overwrite_kwargs.get("version_name", config["version_name"]) - config = update_model_config(config, mode, model_name, version_name, model_cfg_path=model_cfg_path) + config = update_model_config(config, mode, model_name, version_name) # update with config version if specified config_version = overwrite_kwargs.get("config_version", None) if config_version is not None: print("Overwriting config with config_version", config_version) - config = update_model_config(config, mode, model_name, config_version, model_cfg_path=model_cfg_path) + config = update_model_config(config, mode, model_name, config_version) # update with overwrite_kwargs # Combined args are useful for hyperparameter search @@ -542,69 +435,3 @@ def get_config(model_name, mode='train', dataset=None, model_cfg_path=None, **ov def change_dataset(config, new_dataset): config.update(DATASETS_CONFIG[new_dataset]) return config - - -def get_config_user(model_name, mode='infer', model_cfg_path=None, **overwrite_kwargs): - """Main entry point to get the config for the model. - - Args: - model_name (str): name of the desired model. - mode (str, optional): "train" or "infer". Defaults to 'train'. - dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well. Defaults to None. - - Keyword Args: key-value pairs of arguments to overwrite the default config. - - The order of precedence for overwriting the config is (Higher precedence first): - # 1. overwrite_kwargs - # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json - # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json - # 4. common_config: Default config for all models specified in COMMON_CONFIG - - Returns: - easydict: The config dictionary for the model. 
- """ - - check_choices("Model", model_name, ["zoedepth", "zoedepth_nk", "zoedepth_custom"]) - check_choices("Mode", mode, ["train", "infer", "eval"]) - - config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG}) - config = update_model_config(config, mode, model_name, model_cfg_path=model_cfg_path) - - # update with model version specific config - version_name = overwrite_kwargs.get("version_name", config["version_name"]) - config = update_model_config(config, mode, model_name, version_name, model_cfg_path=model_cfg_path) - - # update with config version if specified - config_version = overwrite_kwargs.get("config_version", None) - if config_version is not None: - print("Overwriting config with config_version", config_version) - config = update_model_config(config, mode, model_name, config_version, model_cfg_path=model_cfg_path) - - # update with overwrite_kwargs - # Combined args are useful for hyperparameter search - overwrite_kwargs = split_combined_args(overwrite_kwargs) - config = {**config, **overwrite_kwargs} - - # Casting to bool # TODO: Not necessary. Remove and test - for key in KEYS_TYPE_BOOL: - if key in config: - config[key] = bool(config[key]) - - # Model specific post processing of config - parse_list(config, "n_attractors") - - # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs - if 'bin_conf' in config and 'n_bins' in overwrite_kwargs: - bin_conf = config['bin_conf'] # list of dicts - n_bins = overwrite_kwargs['n_bins'] - new_bin_conf = [] - for conf in bin_conf: - conf['n_bins'] = n_bins - new_bin_conf.append(conf) - config['bin_conf'] = new_bin_conf - - config['model'] = model_name - typed_config = {k: infer_type(v) for k, v in config.items()} - # add hostname to config - config['hostname'] = platform.node() - return edict(typed_config) diff --git a/zoedepth/utils/geometry.py b/zoedepth/utils/geometry.py index ac34589041e866949c8f65eac85abae4e17ad89f..e3da8c75b5a8e39b4b58a4dcd827b84d79b9115c 100644 --- a/zoedepth/utils/geometry.py +++ b/zoedepth/utils/geometry.py @@ -24,21 +24,21 @@ import numpy as np -def get_intrinsics(H,W,fov=55): +def get_intrinsics(H,W): """ Intrinsics for a pinhole camera model. Assume fov of 55 degrees and central principal point. 
""" - f = 0.5 * W / np.tan(0.5 * fov * np.pi / 180.0) + f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0) cx = 0.5 * W cy = 0.5 * H return np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]]) -def depth_to_points(depth, R=None, t=None, fov=55): +def depth_to_points(depth, R=None, t=None): - K = get_intrinsics(depth.shape[1], depth.shape[2], fov=fov) + K = get_intrinsics(depth.shape[1], depth.shape[2]) Kinv = np.linalg.inv(K) if R is None: R = np.eye(3) diff --git a/zoedepth/utils/misc.py b/zoedepth/utils/misc.py index 452ec23fbbcf4564d1eeb9309d0dcc4760469626..4bbe403d3669829eecdf658458c76aa5e87e2b33 100644 --- a/zoedepth/utils/misc.py +++ b/zoedepth/utils/misc.py @@ -42,9 +42,7 @@ import torch.nn as nn import torch.utils.data.distributed from PIL import Image from torchvision.transforms import ToTensor -import cv2 -import matplotlib class RunningAverage: def __init__(self): @@ -96,7 +94,7 @@ class RunningAverageDict: return {key: value.get_value() for key, value in self._dict.items()} -def colorize(value, vmin=None, vmax=None, cmap='turbo_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None): +def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None): """Converts a depth map to a color image. Args: @@ -201,88 +199,7 @@ def compute_errors(gt, pred): silog=silog, sq_rel=sq_rel) -def shift_2d_replace(data, dx, dy, constant=False): - shifted_data = np.roll(data, dx, axis=1) - if dx < 0: - shifted_data[:, dx:] = constant - elif dx > 0: - shifted_data[:, 0:dx] = constant - - shifted_data = np.roll(shifted_data, dy, axis=0) - if dy < 0: - shifted_data[dy:, :] = constant - elif dy > 0: - shifted_data[0:dy, :] = constant - return shifted_data - -def soft_edge_error(pred, gt, radius=1): - abs_diff=[] - for i in range(-radius, radius + 1): - for j in range(-radius, radius + 1): - abs_diff.append(np.abs(shift_2d_replace(gt, i, j, 0) - pred)) - return np.minimum.reduce(abs_diff) - -def get_boundaries(disp, th=1., dilation=10): - edges_y = np.logical_or(np.pad(np.abs(disp[1:, :] - disp[:-1, :]) > th, ((1, 0), (0, 0))), - np.pad(np.abs(disp[:-1, :] - disp[1:, :]) > th, ((0, 1), (0, 0)))) - edges_x = np.logical_or(np.pad(np.abs(disp[:, 1:] - disp[:, :-1]) > th, ((0, 0), (1, 0))), - np.pad(np.abs(disp[:, :-1] - disp[:,1:]) > th, ((0, 0), (0, 1)))) - edges = np.logical_or(edges_y, edges_x).astype(np.float32) - - if dilation > 0: - kernel = np.ones((dilation, dilation), np.uint8) - edges = cv2.dilate(edges, kernel, iterations=1) - - return edges - - -@torch.no_grad() -def scale_shift_linear(rendered_depth, predicted_depth, mask, fuse=True, return_params=False): - """ - Optimize a scale and shift parameter in the least squares sense, such that rendered_depth and predicted_depth match. 
- Formally, solves the following objective: - - min || (d * a + b) - d_hat || - a, b - - where d = 1 / predicted_depth, d_hat = 1 / rendered_depth - - :param rendered_depth: torch.Tensor (H, W) - :param predicted_depth: torch.Tensor (H, W) - :param mask: torch.Tensor (H, W) - True: valid points of rendered_depth, False: invalid points of rendered_depth (ignore) - :param fuse: whether to fuse shifted/scaled predicted_depth with the rendered_depth - - :return: scale/shift corrected depth - """ - if mask.sum() == 0: - return predicted_depth - - # rendered_disparity = 1 / rendered_depth[mask].unsqueeze(-1) - # predicted_disparity = 1 / predicted_depth[mask].unsqueeze(-1) - - rendered_disparity = rendered_depth[mask].unsqueeze(-1) - predicted_disparity = predicted_depth[mask].unsqueeze(-1) - - X = torch.cat([predicted_disparity, torch.ones_like(predicted_disparity)], dim=1) - XTX_inv = (X.T @ X).inverse() - XTY = X.T @ rendered_disparity - AB = XTX_inv @ XTY - - if return_params: - return AB - - fixed_disparity = (predicted_depth) * AB[0] + AB[1] - fixed_depth = fixed_disparity - - if fuse: - fused_depth = torch.where(mask, rendered_depth, fixed_depth) - return fused_depth - else: - return fixed_depth - - - -def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, disp_gt_edges=None, pred_depths=None, **kwargs): +def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, **kwargs): """Compute metrics of predicted depth maps. Applies cropping and masking as necessary or specified via arguments. Refer to compute_errors for more details on metrics. """ if 'config' in kwargs: @@ -294,7 +211,7 @@ def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True if gt.shape[-2:] != pred.shape[-2:] and interpolate: pred = nn.functional.interpolate( - pred.unsqueeze(dim=0).unsqueeze(dim=0), gt.shape[-2:], mode='bilinear', align_corners=True).squeeze() + pred, gt.shape[-2:], mode='bilinear', align_corners=True) pred = pred.squeeze().cpu().numpy() pred[pred < min_depth_eval] = min_depth_eval @@ -306,7 +223,6 @@ def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True valid_mask = np.logical_and( gt_depth > min_depth_eval, gt_depth < max_depth_eval) - eval_mask = np.ones(valid_mask.shape) if garg_crop or eigen_crop: gt_height, gt_width = gt_depth.shape eval_mask = np.zeros(valid_mask.shape) @@ -326,36 +242,8 @@ def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True else: eval_mask = np.ones(valid_mask.shape) valid_mask = np.logical_and(valid_mask, eval_mask) + return compute_errors(gt_depth[valid_mask], pred[valid_mask]) - # if dataset == 'nyu': - # # pred = scale_shift_linear(torch.tensor(pred_depths), torch.tensor(pred), torch.tensor(valid_mask), fuse=False).numpy() - # pred = scale_shift_linear(torch.tensor(gt), torch.tensor(pred), torch.tensor(valid_mask), fuse=False).numpy() - - metrics = compute_errors(gt_depth[valid_mask], pred[valid_mask]) - - mask = valid_mask.squeeze() # squeeze - gt = gt_depth - pred = pred - see_depth = 0 - if disp_gt_edges is None: - print("Maybe we need edge maps from origin disp!") - edges = get_boundaries(gt, th=0.08, dilation=0) - else: - edges = disp_gt_edges - - mask = np.logical_and(mask, edges) - import matplotlib.pyplot as plt - if mask.sum() > 0: - see_depth = soft_edge_error(pred, gt)[mask].mean() - metrics['see'] = see_depth - - return metrics - 
-# 'abs_rel': 0.07546425755890458, 'rmse': 0.2714709522322233, base zoe -# 'abs_rel': 0.04409278385819647, 'rmse': 0.18093922881791188, base zoe+opt - -# patchfusion + pred scale-shift: 'abs_rel': 0.09078774519765959, 'rmse': 0.31991247948976803, -# gt scale-shift abs_rel': 0.06316796072476771, 'rmse': 0.24189620860353886, #################################### Model uilts ################################################ @@ -384,8 +272,6 @@ def parallelize(config, model, find_unused_parameters=True): model = model.cuda(config.gpu) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu, find_unused_parameters=find_unused_parameters) - # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu, - # find_unused_parameters=True) elif config.gpu is None: # Use DP @@ -479,20 +365,4 @@ def save_raw_16bit(depth, fpath="raw.png"): depth = depth.astype(np.uint16) depth = Image.fromarray(depth) depth.save(fpath) - print("Saved raw depth to", fpath) - - -def generatemask(size, k_size=-1, sigma=-1, h_factor=0.03, w_factor=0.02): - # Generates a Guassian mask - mask = np.zeros(size, dtype=np.float32) - if sigma == -1: - sigma = int(size[0]/16) - if k_size == -1: - k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1) - # mask[int(0.02*size[0]):size[0] - int(0.02*size[0]), int(0.015*size[1]): size[1] - int(0.015*size[1])] = 1 - mask[int(h_factor*size[0]):size[0] - int(h_factor*size[0]), int(w_factor*size[1]): size[1] - int(w_factor*size[1])] = 1 - mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) - mask = (mask - mask.min()) / (mask.max() - mask.min()) - mask = mask.astype(np.float32) - return mask - + print("Saved raw depth to", fpath) \ No newline at end of file
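Both depth_alignment.py and the misc.py hunk above contained the same closed-form scale/shift alignment: solve min over a, b of || a * d + b - d_hat ||^2 with the normal equations (X^T X)^{-1} X^T y, where X stacks the predicted depth and a column of ones. A small NumPy sketch under the same assumptions (2-D depth maps, boolean validity mask) follows; np.linalg.lstsq would be numerically safer than the explicit inverse, but the inverse keeps the sketch close to the removed code:

import numpy as np

def scale_shift_align(reference, predicted, mask):
    # reference, predicted: (H, W) arrays; mask: (H, W) bool array of valid reference pixels
    d = predicted[mask].reshape(-1, 1)
    d_hat = reference[mask].reshape(-1, 1)
    X = np.concatenate([d, np.ones_like(d)], axis=1)
    a, b = (np.linalg.inv(X.T @ X) @ (X.T @ d_hat)).ravel()
    return a * predicted + b

rng = np.random.default_rng(0)
gt = rng.uniform(1.0, 10.0, size=(4, 4))
pred = 0.5 * gt + 2.0                              # mis-scaled, shifted prediction
aligned = scale_shift_align(gt, pred, np.ones_like(gt, dtype=bool))
assert np.allclose(aligned, gt)                    # exact recovery for a purely affine error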