diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..dfe0770424b2a19faf507a501ebfc23be8f54e7b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,2 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..de5331ee8c7488df14839eac87f3b9ca671a6353 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Ultimate RVC project +audio +models +temp +uv +uv.lock + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem new file mode 100644 index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3 --- /dev/null +++ b/.gradio/certificate.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- +MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 +ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA 
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d +emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= +-----END CERTIFICATE----- diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d59342ac267bd454180135a9f1d75e2ae4d9f1d8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 JackismyShephard + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index c288c38a912dc4c30e064d136e5d56e139c3b9c9..24d1730bc8cf39d90c6e791425438917b8e0ce24 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,254 @@ --- title: HRVC -emoji: 🌖 -colorFrom: green -colorTo: gray +app_file: src/ultimate_rvc/web/main.py sdk: gradio sdk_version: 5.6.0 -app_file: app.py -pinned: false --- +# Ultimate RVC -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +An extension of [AiCoverGen](https://github.com/SociallyIneptWeeb/AICoverGen), which provides several new features and improvements, enabling users to generate song covers using RVC with ease. Ideal for people who want to incorporate singing functionality into their AI assistant/chatbot/vtuber, or for people who want to hear their favourite characters sing their favourite song. + + + +![ ](images/webui_generate.png?raw=true) + +Ultimate RVC is under constant development and testing, but you can try it out right now locally or on Google Colab! + +## New Features + +* Easy and automated setup using launcher scripts for both windows and Debian-based linux systems +* Caching system which saves intermediate audio files as needed, thereby reducing inference time as much as possible. For example, if song A has already been converted using model B and now you want to convert song A using model C, then vocal extraction can be skipped and inference time reduced drastically +* Ability to listen to intermediate audio files in the UI. This is useful for getting an idea of what is happening in each step of the song cover generation pipeline +* A "multi-step" song cover generation tab: here you can try out each step of the song cover generation pipeline in isolation. For example, if you already have extracted vocals available and only want to convert these using your voice model, then you can do that here. Besides, this tab is useful for experimenting with settings for each step of the song cover generation pipeline +* An overhaul of the song input component for the song cover generation pipeline. 
Now cached input songs can be selected from a dropdown, so that you don't have to supply the Youtube link of a song each time you want to convert it. +* A new "manage models" tab, which collects and revamps all existing functionality for managing voice models, as well as adds some new features, such as the ability to delete existing models +* A new "manage audio" tab, which allows you to interact with all audio generated by the app. Currently, this tab supports deleting audio files. +* Lots of visual and performance improvements resulting from updating from Gradio 3 to Gradio 5 and from python 3.9 to python 3.12 +* A redistributable package on PyPI, which allows you to access the Ultimate RVC project without cloning any repositories. + +## Colab notebook + +For those without a powerful enough NVIDIA GPU, you may try Ultimate RVC out using Google Colab. + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JackismyShephard/ultimate-rvc/blob/main/notebooks/ultimate_rvc_colab.ipynb) + +For those who want to run the Ultimate RVC project locally, follow the setup guide below. + +## Setup + +The Ultimate RVC project currently supports Windows and Debian-based Linux distributions, namely Ubuntu 22.04 and Ubuntu 24.04. Support for other platforms is not guaranteed. + +To setup the project follow the steps below and execute the provided commands in an appropriate terminal. On windows this terminal should be **powershell**, while on Debian-based linux distributions it should be a **bash**-compliant shell. + +### Install Git + +Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer. + +### Set execution policy (Windows only) + +To execute the subsequent commands on Windows, it is necessary to first grant +powershell permission to run scripts. This can be done at a user level as follows: + +```console +Set-ExecutionPolicy RemoteSigned -Scope CurrentUser +``` + +### Clone Ultimate RVC repository + +```console +git clone https://github.com/JackismyShephard/ultimate-rvc +cd ultimate-rvc +``` + +### Install dependencies + +```console +./urvc install +``` +Note that on Linux, this command will install the CUDA 12.4 toolkit system-wide, if it is not already available. In case you have problems, you may need to install the toolkit manually. + +## Usage + +### Start the app + +```console +./urvc run +``` + +Once the following output message `Running on local URL: http://127.0.0.1:7860` appears, you can click on the link to open a tab with the web app. + +### Manage models + +#### Download models + +![ ](images/webui_dl_model.png?raw=true) + +Navigate to the `Download model` subtab under the `Manage models` tab, and paste the download link to an RVC model and give it a unique name. +You may search the [AI Hub Discord](https://discord.gg/aihub) where already trained voice models are available for download. +The downloaded zip file should contain the .pth model file and an optional .index file. + +Once the 2 input fields are filled in, simply click `Download`! Once the output message says `[NAME] Model successfully downloaded!`, you should be able to use it in the `Generate song covers` tab! + +#### Upload models + +![ ](images/webui_upload_model.png?raw=true) + +For people who have trained RVC v2 models locally and would like to use them for AI cover generations. +Navigate to the `Upload model` subtab under the `Manage models` tab, and follow the instructions. 
+Once the output message says `Model with name [NAME] successfully uploaded!`, you should be able to use it in the `Generate song covers` tab! + +#### Delete RVC models + +TBA + +### Generate song covers + +#### One-click generation + +![ ](images/webui_generate.png?raw=true) + +* From the Voice model dropdown menu, select the voice model to use. +* In the song input field, copy and paste the link to any song on YouTube, the full path to a local audio file, or select a cached input song. +* Pitch should be set to either -12, 0, or 12 depending on the original vocals and the RVC AI modal. This ensures the voice is not *out of tune*. +* Other advanced options for vocal conversion, audio mixing and etc. can be viewed by clicking the appropriate accordion arrow to expand. + +Once all options are filled in, click `Generate` and the AI generated cover should appear in a less than a few minutes depending on your GPU. + +#### Multi-step generation + +TBA + +## CLI + +### Manual download of RVC models + +Unzip (if needed) and transfer the `.pth` and `.index` files to a new folder in the [rvc models](models/rvc) directory. Each folder should only contain one `.pth` and one `.index` file. + +The directory structure should look something like this: + +```text +├── models +| ├── audio_separator +| ├── rvc +│ ├── John +│ │ ├── JohnV2.pth +│ │ └── added_IVF2237_Flat_nprobe_1_v2.index +│ ├── May +│ │ ├── May.pth +│ │ └── added_IVF2237_Flat_nprobe_1_v2.index +│ └── hubert_base.pt +├── notebooks +├── notes +└── src +``` + +### Running the pipeline + +#### Usage + +```console +./urvc cli song-cover run-pipeline [OPTIONS] SOURCE MODEL_NAME +``` + +##### Arguments + +* `SOURCE`: A Youtube URL, the path to a local audio file or the path to a song directory. [required] +* `MODEL_NAME`: The name of the voice model to use for vocal conversion. [required] + +##### Options + +* `--n-octaves INTEGER`: The number of octaves to pitch-shift the converted vocals by.Use 1 for male-to-female and -1 for vice-versa. [default: 0] +* `--n-semitones INTEGER`: The number of semi-tones to pitch-shift the converted vocals, instrumentals, and backup vocals by. Altering this slightly reduces sound quality [default: 0] +* `--f0-method [rmvpe|mangio-crepe]`: The method to use for pitch detection during vocal conversion. Best option is RMVPE (clarity in vocals), then Mangio-Crepe (smoother vocals). [default: rmvpe] +* `--index-rate FLOAT RANGE`: A decimal number e.g. 0.5, Controls how much of the accent in the voice model to keep in the converted vocals. Increase to bias the conversion towards the accent of the voice model. [default: 0.5; 0<=x<=1] +* `--filter-radius INTEGER RANGE`: A number between 0 and 7. If >=3: apply median filtering to the pitch results harvested during vocal conversion. Can help reduce breathiness in the converted vocals. [default: 3; 0<=x<=7] +* `--rms-mix-rate FLOAT RANGE`: A decimal number e.g. 0.25. Controls how much to mimic the loudness of the input vocals (0) or a fixed loudness (1) during vocal conversion. [default: 0.25; 0<=x<=1] +* `--protect FLOAT RANGE`: A decimal number e.g. 0.33. Controls protection of voiceless consonants and breath sounds during vocal conversion. Decrease to increase protection at the cost of indexing accuracy. Set to 0.5 to disable. [default: 0.33; 0<=x<=0.5] +* `--hop-length INTEGER`: Controls how often the CREPE-based pitch detection algorithm checks for pitch changes during vocal conversion. Measured in milliseconds. 
Lower values lead to longer conversion times and a higher risk of voice cracks, but better pitch accuracy. Recommended value: 128. [default: 128] +* `--room-size FLOAT RANGE`: The room size of the reverb effect applied to the converted vocals. Increase for longer reverb time. Should be a value between 0 and 1. [default: 0.15; 0<=x<=1] +* `--wet-level FLOAT RANGE`: The loudness of the converted vocals with reverb effect applied. Should be a value between 0 and 1 [default: 0.2; 0<=x<=1] +* `--dry-level FLOAT RANGE`: The loudness of the converted vocals wihout reverb effect applied. Should be a value between 0 and 1. [default: 0.8; 0<=x<=1] +* `--damping FLOAT RANGE`: The absorption of high frequencies in the reverb effect applied to the converted vocals. Should be a value between 0 and 1. [default: 0.7; 0<=x<=1] +* `--main-gain INTEGER`: The gain to apply to the post-processed vocals. Measured in dB. [default: 0] +* `--inst-gain INTEGER`: The gain to apply to the pitch-shifted instrumentals. Measured in dB. [default: 0] +* `--backup-gain INTEGER`: The gain to apply to the pitch-shifted backup vocals. Measured in dB. [default: 0] +* `--output-sr INTEGER`: The sample rate of the song cover. [default: 44100] +* `--output-format [mp3|wav|flac|ogg|m4a|aac]`: The audio format of the song cover. [default: mp3] +* `--output-name TEXT`: The name of the song cover. +* `--help`: Show this message and exit. + +## Update to latest version + +```console +./urvc update +``` + +## Development mode + +When developing new features or debugging, it is recommended to run the app in development mode. This enables hot reloading, which means that the app will automatically reload when changes are made to the code. + +```console +./urvc dev +``` + +## PyPI package + +The Ultimate RVC project is also available as a [distributable package](https://pypi.org/project/ultimate-rvc/) on [PyPI](https://pypi.org/). + +### Installation + +The package can be installed with pip in a **Python 3.12**-based environment. To do so requires first installing PyTorch with Cuda support: + +```console +pip install torch==2.5.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124 +``` + +Additionally, on Windows the `diffq` package must be installed manually as follows: + +```console +pip install https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/diffq-0.2.4-cp312-cp312-win_amd64.whl +``` + +The Ultimate RVC project package can then be installed as follows: + +```console +pip install ultimate-rvc +``` + +### Usage + +The `ultimate-rvc` package can be used as a python library but is primarily intended to be used as a command line tool. The package exposes two top-level commands: + +* `urvc` which lets the user generate song covers directly from their terminal +* `urvc-web` which starts a local instance of the Ultimate RVC web application + +For more information on either command supply the option `--help`. + +## Environment Variables + +The behaviour of the Ultimate RVC project can be customized via a number of environment variables. Currently these environment variables control only logging behaviour. They are as follows: + +* `URVC_CONSOLE_LOG_LEVEL`: The log level for console logging. If not set, defaults to `ERROR`. +* `URVC_FILE_LOG_LEVEL`: The log level for file logging. If not set, defaults to `INFO`. +* `URVC_LOGS_DIR`: The directory in which log files will be stored. If not set, logs will be stored in a `logs` directory in the current working directory. 
+* `URVC_NO_LOGGING`: If set to `1`, logging will be disabled. + +## Terms of Use + +The use of the converted voice for the following purposes is prohibited. + +* Criticizing or attacking individuals. + +* Advocating for or opposing specific political positions, religions, or ideologies. + +* Publicly displaying strongly stimulating expressions without proper zoning. + +* Selling of voice models and generated voice clips. + +* Impersonation of the original owner of the voice with malicious intentions to harm/hurt others. + +* Fraudulent purposes that lead to identity theft or fraudulent phone calls. + +## Disclaimer + +I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software. diff --git a/images/webui_dl_model.png b/images/webui_dl_model.png new file mode 100644 index 0000000000000000000000000000000000000000..537f98f3474774f16714df5dc5c13cad6af3d3be Binary files /dev/null and b/images/webui_dl_model.png differ diff --git a/images/webui_generate.png b/images/webui_generate.png new file mode 100644 index 0000000000000000000000000000000000000000..ee780f33bd94619484006ae106e9c2b9f39bb851 Binary files /dev/null and b/images/webui_generate.png differ diff --git a/images/webui_upload_model.png b/images/webui_upload_model.png new file mode 100644 index 0000000000000000000000000000000000000000..8a8d8342d4ff7923257af0a46939a2f61b76d796 Binary files /dev/null and b/images/webui_upload_model.png differ diff --git a/notebooks/ultimate_rvc_colab.ipynb b/notebooks/ultimate_rvc_colab.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..993a063c2450d3c2a9aa04208f542198d6996b59 --- /dev/null +++ b/notebooks/ultimate_rvc_colab.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "kmyCzJVyCymN" + }, + "source": [ + "Colab for [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)\n", + "\n", + "This Colab notebook will **help** you if you don’t have a GPU or if your PC isn’t very powerful.\n", + "\n", + "Simply click `Runtime` in the top navigation bar and `Run all`. 
Wait for the output of the final cell to show the public gradio url and click on it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title 0: Initialize notebook\n", + "%pip install ipython-autotime\n", + "%load_ext autotime\n", + "\n", + "import codecs\n", + "import os\n", + "\n", + "from IPython.display import clear_output\n", + "\n", + "clear_output()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "aaokDv1VzpAX" + }, + "outputs": [], + "source": [ + "# @title 1: Clone repository\n", + "cloneing = codecs.decode(\n", + " \"uggcf://tvguho.pbz/WnpxvfzlFurcuneq/hygvzngr-eip.tvg\",\n", + " \"rot_13\",\n", + ")\n", + "\n", + "!git clone $cloneing HRVC\n", + "%cd /content/HRVC\n", + "clear_output()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "lVGNygIa0F_1" + }, + "outputs": [], + "source": [ + "# @title 2: Install dependencies\n", + "\n", + "light = codecs.decode(\"uggcf://nfgeny.fu/hi/0.5.0/vafgnyy.fu\", \"rot_13\")\n", + "inits = codecs.decode(\"./fep/hygvzngr_eip/pber/znva.cl\", \"rot_13\")\n", + "\n", + "!apt install -y python3-dev unzip\n", + "!curl -LsSf $light | sh\n", + "\n", + "os.environ[\"URVC_CONSOLE_LOG_LEVEL\"] = \"WARNING\"\n", + "!uv run -q $inits\n", + "clear_output()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "lVGNygIa0F_2" + }, + "outputs": [], + "source": [ + "# @title 3: Run Ultimate RVC\n", + "\n", + "runpice = codecs.decode(\"./fep/hygvzngr_eip/jro/znva.cl\", \"rot_13\")\n", + "\n", + "!uv run $runpice --share" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notes/TODO.md b/notes/TODO.md new file mode 100644 index 0000000000000000000000000000000000000000..7e14c46f3ad1aba08b088f8544b87e718e9735db --- /dev/null +++ b/notes/TODO.md @@ -0,0 +1,462 @@ +# TODO + +* should rename instances of "models" to "voice models" + +## Project/task management + +* Should find tool for project/task management +* Tool should support: + * hierarchical tasks + * custom labels and or priorities on tasks + * being able to filter tasks based on those labels + * being able to close and resolve tasks + * Being able to integrate with vscode + * Access for multiple people (in a team) +* Should migrate the content of this file into tool +* Potential candidates + * GitHub projects + * Does not yet support hierarchical tasks so no + * Trello + * Does not seem to support hierarchical tasks either + * Notion + * Seems to support hierarchical tasks, but is complicated + * Todoist + * seems to support both hierarchical tasks, custom labels, filtering on those labels, multiple users and there are unofficial plugins for vscode. + +## Front end + +### Modularization + +* Improve modularization of web code using helper functions defined [here](https://huggingface.co/spaces/WoWoWoWololo/wrapping-layouts/blob/main/app.py) +* Split front-end modules into further sub-modules. 
+ * Structure of web folder should be: + * `web` + * `manage_models` + * `__init__.py` + * `main.py` + * `manage_audio` + * `__init__.py` + * `main.py` + * `generate_song_covers` + * `__init__.py` + * `main.py` + * `one_click_generation` + * `__init__.py` + * `main.py` + * `accordions` + * `__init__.py` + * `options_x.py` ... ? + * `multi_step_generation` + * `__init__.py` + * `main.py` + * `accordions` + * `__init__.py` + * `step_X.py` ... + * `common.py` + * For `multi_step_generation/step_X.py`, its potential render function might have to take the set of all "input tracks" in the multi-step generation tab, so these will then have to be defined in `multi_step_generation/main.py`. Other components passed to `multi_step_generation/main.py` might also need to be passed further down to `multi_step_generation/step_X.py` + * For `one_click_generation/option_X.py`, its potential render function should + render the accordion for the given options and return the components defined in the accordion? Other components passed to `one_click_generation/main.py` might also need to be passed further down to `one_click_generation/option_X.py` + * Import components instead of passing them as inputs to render functions (DIFFICULT TO IMPLEMENT) + * We have had problems before with component ids when components are instantiated outside a Blocks context in a separate module and then import into other modules and rendered in their blocks contexts. + +### Multi-step generation + +* If possible merge two consecutive event listeners using `update_cached_songs` in the song retrieval accordion. +* add description describing how to use each accordion and suggestions for workflows + +* add option for adding more input tracks to the mix song step + * new components should be created dynamically based on a textfield with names and a button for creating new component + * when creating a new component a new transfer button and dropdown should also be created + * and the transfer choices for all dropdowns should be updated to also include the new input track + * we need to consider how to want to handle vertical space + * should be we make a new row once more than 3 tracks are on one row? + * yes and there should be also created the new slider on a new row + * right under the first row (which itself is under the row with song dir dropdown) + +* should also have the possiblity to add more tracks to the pitch shift accordion. + +* add a confirmation box with warning if trying to transfer output track to input track that is not empty. + * could also have the possibility to ask the user to transfer to create a new input track and transfer the output track to it. + * this would just be the same pop up confirmation box as before but in addition to yes and cancel options it will also have a "transfer to new input track" option. + * we need custom javasctip for this. + +### Common + +* fix problem with typing of block.launch() + * problem stems from doing from gradio import routes + * so instead should import from gradio.routes directly + * open a pr with changes +* save default values for options for song generation in an `SongCoverOptionDefault` enum. + * then reference this enum across the two tabs + * and also use `list[SongCoverOptionDefault]` as input to reset settings click event listener in single click generation tab. +* Persist state of app (currently selected settings etc.) 
across re-renders + * This includes: + * refreshing a browser windows + * Opening app in new browser window + * Maybe it should also include when app is started anew? + * Possible solutions + * use gr.browserstate to allow state to be preserved acrross page loads. + * Save any changes to components to a session dictionary and load from it upon refresh + * See [here](https://github.com/gradio-app/gradio/issues/3106#issuecomment-1694704623) + * Problem is that this solution might not work with accordions or other types of blocks + * should use .expand() and .collapse() event listeners on accordions to programmatically reset the state of accordions to what they were before after user has refreshed the page + * Use localstorage + * see [here](https://huggingface.co/spaces/YiXinCoding/gradio-chat-history/blob/main/app.py) and [here](https://huggingface.co/spaces/radames/gradio_window_localStorage/blob/main/app.py) + + * Whenever the state of a component is changed save the new state to a custom JSON file. + * Then whenever the app is refreshed load the current state of components from the JSON file + * This solution should probably work for Block types that are not components +* need to fix the `INFO: Could not find files for the given pattern(s)` on startup of web application on windows (DIFFICULT TO IMPLEMENT) + * this is an error that gradio needs to fix +* Remove reset button on slider components (DIFFICULT TO IMPLEMENT) + * this is a gradio feature that needs to be removed. +* Fix that gradio removes special symbols from audio paths when loaded into audio components (DIFFICULT TO IMPLEMENT) + * includes parenthesis, question marks, etc. + * its a gradio bug so report? +* Add button for cancelling any currently running jobs (DIFFICULT TO IMPLEMENT) + * Not supported by Gradio natively + * Also difficult to implement manually as Gradio seems to be running called backend functions in thread environments +* dont show error upon missing confirmation (DIFFICULT TO IMPLEMENT) + * can return `gr.update()`instead of raising an error in relevant event listener function + * but problem is that subsequent steps will still be executed in this case +* clearing temporary files with the `delete_cache` parameter only seems to work if all windows are closed before closing the app process (DIFFICULT TO IMPLEMENT) + * this is a gradio bug so report? + +## Online hosting optimization + +* make concurrency_id and concurrency limit on components be dependent on whether gpu is used or not + * if only cpu then there should be no limit +* increase value of `default_concurrency_limit` in `Block.queue` so that the same event listener + * can be called multiple times concurrently +* use `Block.launch()` with `max_file_size` to prevent too large uploads +* define as many functions with async as possible to increase responsiveness of app + * and then use `Block.launch()` with `max_threads`set to an appropriate value representing the number of concurrent threads that can be run on the server (default is 40) +* Persist state of app (currently selected settings etc.) across re-renders +* consider setting `max_size` in `Block.queue()` to explicitly limit the number of people that can be in the queue at the same time +* clearing of temporary files should happen after a user logs in and out + * and in this case it should only be temporary files for the active user that are cleared + * Is that even possible to control? 
+* enable server side rendering (requires installing node and setting ssr_mode = true in .launch) (DIFFICULT TO IMPLEMENT) + * Also needs to set GRADIO_NODE_PATH to point to the node executable + * problem is that on windows there is a ERR_UNSUPPORTED_ESM_URL_SCHEME which needs to be fixed by gradio + * see here https://github.com/nodejs/node/issues/31710 + * on linux it works but it is not possible to shutdown server using CTRL+ C + + +## Back end + +### `generate_song_cover.py` + +* intermediate file prefixes should be made into enums +* find framework for caching intermediate results rather than relying on your homemade system + + * Joblib: + * scikit learn: + + * + * + * + +* Support specific audio formats for intermediate audio file? + * it might require some more code to support custom output format for all pipeline functions. + +* expand `_get_model_name` so that it can take any audio file in an intermediate audio folder as input (DIFFICULT TO IMPLEMENT) + * Function should then try to recursively + * look for a corresponding json metadata file + * find the model name in that file if it exists + * otherwise find the path in the input field in the metadata file + * repeat + * should also consider whether input audio file belongs to step before audio conversion step +* use pydantic models to constrain numeric inputs (DIFFICULT TO IMPLEMENT) + * for inputs to `convert` function for example + * Use `Annotated[basic type, Field[constraint]]` syntax along with a @validate_call decorator on functions + * Problem is that pyright does not support `Annotated` so we would have to switch to mypy + +### `manage_models.py` + +* use pandas.read_json to load public models table (DIFFICULT TO IMPLEMENT) + +## CLI + +### Add remaining CLI interfaces + +* Interface for `core.manage_models` +* Interface for `core.manage_audio` +* Interfaces for individual pipeline functions defined in `core.generate_song_covers` + +## python package management + +* need to make project version (in `pyproject.toml`) dynamic so that it is updated automatically when a new release is made +* once diffq-fixed is used by audio-separator we can remove the url dependency on windows + * we will still need to wait for uv to make it easy to install package with torch dependency + * also it is still necessary to install pytorch first as it is not on pypi index +* figure out way of making ./urvc commands execute faster + * when ultimate rvc is downloaded as a pypi package the exposed commands are much faster so investigate this +* update dependencies in pyproject.toml + * use latest compatible version of all packages + * remove commented out code, unless strictly necessary + +## Audio separation + +* expand back-end function(s) so that they are parametrized by both model type as well as model settings + * Need to decide whether we only want to support common model settings or also settings that are unique to each model + * It will probably be the latter, which will then require some extra checks. + * Need to decide which models supported by `audio_separator` that we want to support + * Not all of them seem to work + * Probably MDX models and MDXC models + * Maybe also VR and demucs? + * Revisit online guide for optimal models and settings +* In multi-step generation tab + * Expand audio-separation accordion so that model can be selected and appropriate settings for that model can then be selected. 
+ * Model specific settings should expand based on selected model +* In one-click generation + * Should have an "vocal extration" option accordion + * Should be able to choose which audio separation steps to include in pipeline + * possible steps + * step 1: separating audio form instrumentals + * step 2: separating main vocals from background vocals: + * step 3: de-reverbing vocals + * Should pick steps from dropdown? + * For each selected step a new sub-accordion with options for that step will then appear + * Each accordion should include general settings + * We should decide whether model specific settings should also be supported + * We Should also decide whether sub-accordion should setting for choosing a model and if so render specific settings based the chosen model + * Alternative layout: + * have option to choose number of separation steps + * then dynamically render sub accordions for each of the selected number of steps + * In this case it should be possible to choose models for each accordion + * this field should be iniitally empty + * Other setttings should probably have sensible defaults that are the same + * It might also be a good idea to then have an "examples" pane with recommended combinations of extractions steps + * When one of these is selected, then the selected number of accordions with the preset settings should be filled out + * optimize pre-processing + * check + * Alternatives to `audio-separator` package: + * [Deezer Spleeter](https://github.com/deezer/spleeter) + * supports both CLI and python package + * [Asteroid](https://github.com/asteroid-team/asteroid) + * [Nuzzle](https://github.com/nussl/nussl) + +## GitHub + +### Actions + +* linting with Ruff +* typechecking with Pyright +* running all tests +* automatic building and publishing of project to pypi + * includes automatic update of project version number +* or use pre-commit? + +### README + +* Fill out TBA sections in README +* Add note about not using with VPN? +* Add different emblems/badges in header + * like test coverage, build status, etc. (look at other projects for inspiration) +* spice up text with emojis (look at tiango's projects for inspiration) + +### Releases + +* Make regular releases like done for Applio + * Will be an `.exe` file that when run unzips contents into application folder, where `./urvc run` can then be executed. + * Could it be possible to have `.exe` file just start webapp when clicked? +* Could also include pypi package as a release? + +* use pyinstaller to install app into executable that also includes sox and ffmpeg as dependencies (DLLs) + +### Other + +* In the future consider detaching repo from where it is forked from: + * because it is not possible to make the repo private otherwise + * see: + +## Incorporate upstream changes + +* Incorporate RVC code from [rvc-cli](https://github.com/blaisewf/rvc-cli) (i.e. changes from Applio) + * more options for voice conversion and more efficient voice conversion + * batch conversion sub-tab + * TTS tab + * Model training tab + * support more pre-trained models + * sub-tab under "manage models" tab + * support for querying online database with many models that can be downloaded + * support for audio and model analysis. 
+ * Voice blending tab +* Incorporate latest changes from [RVC-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) + +## Vocal Conversion + +* support arbitrary combination of pitch detection algorithms + * source: +* Investigate using onnx models for inference speedup on cpu +* Add more pitch detection methods + * pm + * harvest + * dio + * rvmpe+ +* Implement multi-gpu Inference + +## TTS conversion + +* also include original edge voice as output + * source: + +## Model management + +### Training models + +* have learning rate for training + * source: +* have a quick training button + * or have preprocess dataset, extract features and generate index happen by default +* Support a loss/training graph + * source: + +### Download models + +* Support batch downloading multiple models + * requires a tabular request form where both a link column and a name column has to be filled out + * we can allow selecting multiple items from public models table and then copying them over +* support quering online database for models matching a given search string like what is done in applio app + * first n rows of online database should be shown by default in public models table + * more rows should be retrieved by scrolling down or clicking a button + * user search string should filter/narrow returned number of rows in public models table + * When clicking a set of rows they should then be copied over for downloading in the "download" table +* support a column with preview sample in public models table + * Only possible if voice snippets are also returned when querying the online database +* Otherwise we can always support voice snippets for voice models that have already been downloaded + * run model on sample text ("quick brown fox runs over the lazy") after it is downloaded + * save the results in a `audio/model_preview` folder + * Preview can then be loaded into a preview audio component when selecting a model from a dropdown + * or if we replace the dropdown with a table with two columns we can have the audio track displayed in the second column + +### Model analysis + +* we could provide a new tab to analyze an existing model like what is done in applio + * or this tab could be consolidated with the delete model tab? + +* we could also provide extra model information after model is downloaded + * potentialy in dropdown to expand? + +## Audio management + +### General + +* Support audio information tool like in applio? + * A new tab where you can upload a song to analyze? +* more elaborate solution: + * tab where where you + * can select any song directory + * select any step in the audio generation pipeline + * then select any intermediate audio file generated in that step + * Then have the possibility to + * Listen to the song + * see a table with its metadata (based on its associated `.json` file) + * add timestamp to json files so they can be sorted in table according to creation date + * And other statistics in a separate component (graph etc.) + * Could have delete buttons both at the level of song_directory, step, and for each song? + * Also consider splitting intermediate audio tracks for each step in to subfolder (0,1,2,3...) + +## Other settings + +* rework other settings tab + * this should also contain other settings such as the ability to change the theme of the app + * there should be a button to apply settings which will reload the app with the new settings + +## Audio post-processing + +* Support more effects from the `pedalboard` pakcage. 
+ * Guitar-style effects: Chorus, Distortion, Phaser, Clipping + * Loudness and dynamic range effects: Compressor, Gain, Limiter + * Equalizers and filters: HighpassFilter, LadderFilter, LowpassFilter + * Spatial effects: Convolution, Delay, Reverb + * Pitch effects: PitchShift + * Lossy compression: GSMFullRateCompressor, MP3Compressor + * Quality reduction: Resample, Bitcrush + * NoiseGate + * PeakFilter + +## Audio Mixing + +* Add main gain loudness slider? +* Add option to equalize output audio with respect to input audio + * i.e. song cover gain (and possibly also more general dynamics) should be the same as those for source song. + * check to see if pydub has functionality for this + * otherwise a simple solution would be computing the RMS of the difference between the loudness of the input and output track + + ```python + rms = np.sqrt(np.mean(np.square(signal))) + dB = 20*np.log10(rms) + #add db to output file in mixing function (using pydub) + ``` + + * When this option is selected the option to set main gain of ouput should be disabled? + +* add more equalization options + * using `pydub.effects` and `pydub.scipy_effects`? + +## Custom UI + +* Experiment with new themes including [Building new ones](https://www.gradio.app/guides/theming-guid) + * first of all make new theme that is like the default gradio 4 theme in terms of using semi transparent orange as the main color and semi-transparent grey for secondary color. The new gradio 5 theme is good apart from using solid colors so maybe use that as base theme. + * Support both dark and light theme in app? + * Add Support for changing theme in app? + * Use Applio theme as inspiration for default theme? +* Experiment with using custom CSS + * Pass `css = {css_string}` to `gr.Blocks` and use `elem_classes` and `elem_id` to have components target the styles define in the CSS string. +* Experiment with [custom DataFrame styling](https://www.gradio.app/guides/styling-the-gradio-dataframe) +* Experiment with custom Javascript +* Look for opportunities for defining new useful custom components + +## Real-time vocal conversion + +* Should support being used as OBS plugin +* Latency is real issue +* Implementations details: + * implement back-end in Rust? + * implement front-end using svelte? + * implement desktop application using C++ or C#? +* see and for inspiration + +## AI assistant mode + +* similar to vocal conversion streaming but instead of converting your voice on the fly, it should: + * take your voice, + * do some language modelling (with an LLM or something) + * then produce an appropriate verbal response +* We already have Kyutais [moshi](https://moshi.chat/?queue_id=talktomoshi) + * Maybe that model can be finetuned to reply with a voice + * i.e. your favorite singer, actor, best friend, family member. + +## Ultimate RVC bot for discord + +* maybe also make a forum on discord? + +## Make app production ready + +* have a "report a bug" tab like in applio? +* should have separate accounts for users when hosting online + * use `gr.LoginButton` and `gr.LogoutButton`? + +* deploy using docker + * See +* Host on own web-server with Nginx + * see + +* Consider having concurrency limit be dynamic, i.e. instead of always being 1 for jobs using gpu consider having it depend upon what resources are available. 
+ * We can app set the GPU_CONCURRENCY limit to be os.envrion["GPU_CONCURRENCY_LIMIT] or 1 and then pass GPU_CONCURRENCY as input to places where event listeners are defined + +## Colab notebook + +* find way of saving virtual environment with python 3.11 in colab notebook (DIFFICULT TO IMPLEMENT) + * so that this environment can be loaded directly rather than downloading all dependencies every time app is opened + +## Testing + +* Add example audio files to use for testing + * Should be located in `audio/examples` + * could have sub-folders `input` and `output` + * in `output` folder we have `output_audio.ext` files each with a corresponding `input_audio.json` file containing metadata explaining arguments used to generate output + * We can then test that actual output is close enough to expected output using audio similarity metric. +* Setup unit testing framework using pytest diff --git a/notes/app-doc.md b/notes/app-doc.md new file mode 100644 index 0000000000000000000000000000000000000000..5f011ba7902c8314db7fea80be3abd9b0553529c --- /dev/null +++ b/notes/app-doc.md @@ -0,0 +1,19 @@ +# `main` + +Run the Ultimate RVC web application. + +**Usage**: + +```console +$ main [OPTIONS] +``` + +**Options**: + +* `-s, --share`: Enable sharing +* `-l, --listen`: Make the web application reachable from your local network. +* `-h, --listen-host TEXT`: The hostname that the server will use. +* `-p, --listen-port INTEGER`: The listening port that the server will use. +* `--install-completion`: Install completion for the current shell. +* `--show-completion`: Show completion for the current shell, to copy it or customize the installation. +* `--help`: Show this message and exit. diff --git a/notes/cli-doc.md b/notes/cli-doc.md new file mode 100644 index 0000000000000000000000000000000000000000..941708b7254df14f3acfc07ecbeb99a40bfe95fd --- /dev/null +++ b/notes/cli-doc.md @@ -0,0 +1,74 @@ +# `urvc-cli` + +CLI for the Ultimate RVC project + +**Usage**: + +```console +$ urvc-cli [OPTIONS] COMMAND [ARGS]... +``` + +**Options**: + +* `--install-completion`: Install completion for the current shell. +* `--show-completion`: Show completion for the current shell, to copy it or customize the installation. +* `--help`: Show this message and exit. + +**Commands**: + +* `song-cover`: Generate song covers + +## `urvc-cli song-cover` + +Generate song covers + +**Usage**: + +```console +$ urvc-cli song-cover [OPTIONS] COMMAND [ARGS]... +``` + +**Options**: + +* `--help`: Show this message and exit. + +**Commands**: + +* `run-pipeline`: Run the song cover generation pipeline. + +### `urvc-cli song-cover run-pipeline` + +Run the song cover generation pipeline. + +**Usage**: + +```console +$ urvc-cli song-cover run-pipeline [OPTIONS] SOURCE MODEL_NAME +``` + +**Arguments**: + +* `SOURCE`: A Youtube URL, the path to a local audio file or the path to a song directory. [required] +* `MODEL_NAME`: The name of the voice model to use for vocal conversion. [required] + +**Options**: + +* `--n-octaves INTEGER`: The number of octaves to pitch-shift the converted vocals by.Use 1 for male-to-female and -1 for vice-versa. [default: 0] +* `--n-semitones INTEGER`: The number of semi-tones to pitch-shift the converted vocals, instrumentals, and backup vocals by. Altering this slightly reduces sound quality [default: 0] +* `--f0-method [rmvpe|mangio-crepe]`: The method to use for pitch detection during vocal conversion. Best option is RMVPE (clarity in vocals), then Mangio-Crepe (smoother vocals). 
[default: rmvpe] +* `--index-rate FLOAT RANGE`: A decimal number e.g. 0.5, Controls how much of the accent in the voice model to keep in the converted vocals. Increase to bias the conversion towards the accent of the voice model. [default: 0.5; 0<=x<=1] +* `--filter-radius INTEGER RANGE`: A number between 0 and 7. If >=3: apply median filtering to the pitch results harvested during vocal conversion. Can help reduce breathiness in the converted vocals. [default: 3; 0<=x<=7] +* `--rms-mix-rate FLOAT RANGE`: A decimal number e.g. 0.25. Controls how much to mimic the loudness of the input vocals (0) or a fixed loudness (1) during vocal conversion. [default: 0.25; 0<=x<=1] +* `--protect FLOAT RANGE`: A decimal number e.g. 0.33. Controls protection of voiceless consonants and breath sounds during vocal conversion. Decrease to increase protection at the cost of indexing accuracy. Set to 0.5 to disable. [default: 0.33; 0<=x<=0.5] +* `--hop-length INTEGER`: Controls how often the CREPE-based pitch detection algorithm checks for pitch changes during vocal conversion. Measured in milliseconds. Lower values lead to longer conversion times and a higher risk of voice cracks, but better pitch accuracy. Recommended value: 128. [default: 128] +* `--room-size FLOAT RANGE`: The room size of the reverb effect applied to the converted vocals. Increase for longer reverb time. Should be a value between 0 and 1. [default: 0.15; 0<=x<=1] +* `--wet-level FLOAT RANGE`: The loudness of the converted vocals with reverb effect applied. Should be a value between 0 and 1 [default: 0.2; 0<=x<=1] +* `--dry-level FLOAT RANGE`: The loudness of the converted vocals wihout reverb effect applied. Should be a value between 0 and 1. [default: 0.8; 0<=x<=1] +* `--damping FLOAT RANGE`: The absorption of high frequencies in the reverb effect applied to the converted vocals. Should be a value between 0 and 1. [default: 0.7; 0<=x<=1] +* `--main-gain INTEGER`: The gain to apply to the post-processed vocals. Measured in dB. [default: 0] +* `--inst-gain INTEGER`: The gain to apply to the pitch-shifted instrumentals. Measured in dB. [default: 0] +* `--backup-gain INTEGER`: The gain to apply to the pitch-shifted backup vocals. Measured in dB. [default: 0] +* `--output-sr INTEGER`: The sample rate of the song cover. [default: 44100] +* `--output-format [mp3|wav|flac|ogg|m4a|aac]`: The audio format of the song cover. [default: mp3] +* `--output-name TEXT`: The name of the song cover. +* `--help`: Show this message and exit. diff --git a/notes/gradio.md b/notes/gradio.md new file mode 100644 index 0000000000000000000000000000000000000000..39ed657a93e96785b1c6b2ac8425b3b5684cd0e6 --- /dev/null +++ b/notes/gradio.md @@ -0,0 +1,615 @@ + +# Gradio notes + +## Modularizing large gradio codebases + +See this [tutorial](https://www.gradio.app/guides/wrapping-layouts) and corresponding [code](https://huggingface.co/spaces/WoWoWoWololo/wrapping-layouts/blob/main/app.py). + +## Event listeners + +### Attaching event listeners using decorators + +```python +@greet_btn.click(inputs=name, outputs=output) +def greet(name): + return "Hello " + name + "!" 
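# the decorator registers greet as the click handler; it is equivalent to defining
# greet normally and then calling greet_btn.click(greet, inputs=name, outputs=output)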
+``` + +### Function input using dicts + +```python +a = gr.Number(label="a") +b = gr.Number(label="b") + +def sub(data): + return data[a] - data[b] +sub_btn.click(sub, inputs={a, b}, outputs=c) +``` + +This syntax may be better for functions with many inputs + +### Function output using dicts + +```python +food_box = gr.Number(value=10, label="Food Count") +status_box = gr.Textbox() + +def eat(food): + if food > 0: + return {food_box: food - 1, status_box: "full"} + else: + return {status_box: "hungry"} + +gr.Button("Eat").click( + fn=eat, + inputs=food_box, + outputs=[food_box, status_box] +) +``` + +Allows you to skip updating some output components. + +### Binding multiple event listeners to one function + +```python +name = gr.Textbox(label="Name") +output = gr.Textbox(label="Output Box") +greet_btn = gr.Button("Greet") +trigger = gr.Textbox(label="Trigger Box") + +def greet(name, evt_data: gr.EventData): + return "Hello " + name + "!", evt_data.target.__class__.__name__ + +def clear_name(evt_data: gr.EventData): + return "" + +gr.on( + triggers=[name.submit, greet_btn.click], + fn=greet, + inputs=name, + outputs=[output, trigger], +).then(clear_name, outputs=[name]) +``` + +* Use `gr.on` with optional `triggers` argument. If `triggers` is not set then the given function will be called for all `.change` event listeners in the app. +* Allows you to DRY a lot of code potentially. + +### Running events continuously + +```python +with gr.Blocks as demo: + timer = gr.Timer(5) + textbox = gr.Textbox() + textbox2 = gr.Textbox() + timer.tick(set_textbox_fn, textbox, textbox2) +``` + +Or alternatively the following semantics can be used: + +```python +with gr.Blocks as demo: +timer = gr.Timer(5) +textbox = gr.Textbox() +textbox2 = gr.Textbox(set_textbox_fn, inputs=[textbox], every=timer) +``` + +## Other semantics + +### Conditional component values + +```python +with gr.Blocks() as demo: + num1 = gr.Number() + num2 = gr.Number() + product = gr.Number(lambda a, b: a * b, inputs=[num1, num2]) +``` + +* Value of component must be a function taking two component values and returning a new component value +* Component must also take a list of inputs indicating which other components should be used to compute its value +* Components value will always be updated whenever the other components `.change` event listeners are called. +* Hence this method can be used to DRY code with many `.change` event listeners + +### Dynamic behavior + +We can use the `@gr.render` decorator to dynamically define components and event listeners while an app is executing + +#### Dynamic components + +```python +import gradio as gr + + with gr.Blocks() as demo: + input_text = gr.Textbox(label="input") + + @gr.render(inputs=input_text) + def show_split(text): + if len(text) == 0: + gr.Markdown("## No Input Provided") + else: + for letter in text: + gr.Textbox(letter) + + demo.launch() +``` + +By default `@gr.render` is called whenever the `.change` event for the given input components are executed or when the app is loaded. This can be overriden by also giving a triggers argument to the decorator: + +```python +@gr.render(inputs=input_text, triggers = [input_text.submit]) +... 
+``` + +#### Dynamic event listeners + +```python +with gr.Blocks() as demo: + text_count = gr.State(1) + add_btn = gr.Button("Add Box") + add_btn.click(lambda x: x + 1, text_count, text_count) + + @gr.render(inputs=text_count) + def render_count(count): + boxes = [] + for i in range(count): + box = gr.Textbox(key=i, label=f"Box {i}") + boxes.append(box) + + def merge(*args): + return " ".join(args) + + merge_btn.click(merge, boxes, output) + + merge_btn = gr.Button("Merge") + output = gr.Textbox(label="Merged Output") +``` + +* All event listeners that use components created inside a render function must also be defined inside that render function +* The event listener can still reference components outside the render function +* Just as with components, whenever a function re-renders, the event listeners created from the previous render are cleared and the new event listeners from the latest run are attached. +* setting `key = ...` when instantiating a component ensures that the value of the component is preserved upon rerender + * This is might also allow us to preserve session state easily across browser refresh? + +#### A more elaborate example + +```python +import gradio as gr + +with gr.Blocks() as demo: + + tasks = gr.State([]) + new_task = gr.Textbox(label="Task Name", autofocus=True) + + def add_task(tasks, new_task_name): + return tasks + [{"name": new_task_name, "complete": False}], "" + + new_task.submit(add_task, [tasks, new_task], [tasks, new_task]) + + @gr.render(inputs=tasks) + def render_todos(task_list): + complete = [task for task in task_list if task["complete"]] + incomplete = [task for task in task_list if not task["complete"]] + gr.Markdown(f"### Incomplete Tasks ({len(incomplete)})") + for task in incomplete: + with gr.Row(): + gr.Textbox(task['name'], show_label=False, container=False) + done_btn = gr.Button("Done", scale=0) + def mark_done(task=task): + task["complete"] = True + return task_list + done_btn.click(mark_done, None, [tasks]) + + delete_btn = gr.Button("Delete", scale=0, variant="stop") + def delete(task=task): + task_list.remove(task) + return task_list + delete_btn.click(delete, None, [tasks]) + + gr.Markdown(f"### Complete Tasks ({len(complete)})") + for task in complete: + gr.Textbox(task['name'], show_label=False, container=False) + +demo.launch() +``` + +* Any event listener that modifies a state variable in a manner that should trigger a re-render must set the state variable as an output. This lets Gradio know to check if the variable has changed behind the scenes. +* In a `gr.render`, if a variable in a loop is used inside an event listener function, that variable should be "frozen" via setting it to itself as a default argument in the function header. See how we have task=task in both mark_done and delete. This freezes the variable to its "loop-time" value. + +### Progress bars + +Instead of doing `gr.progress(percentage, desc= "...")` in core helper functions you can just use tqdm directly in your code by instantiating `gr.progress(track_tqdm = true)` in a web helper function/harness. + +Alternatively, you can also do `gr.Progress().tqdm(iterable, description, total, unit)` to attach a tqdm iterable to the progress bar + +Benefits of either approach is: + +* we do not have to supply a `gr.Progress` object to core functions. 
+
+### State
+
+Any variable created outside a function call is shared by all users of the app.
+
+So when deploying the app in the future, we need to use `gr.State()` for all variables declared outside functions?
+
+## Notes on Gradio classes
+
+* `Blocks.launch()`
+    * `prevent_thread_lock` can be used to have an easier way of shutting down the app?
+    * `show_error`: if `True`, can allow us not to have to re-raise core exceptions as `gr.Error`?
+* `Tab`
+    * The event listener triggered when a tab is selected could be useful?
+* `File`
+    * `file_types`: can use this to limit input types to .pth, .index and .zip when downloading a model.
+* `Label`
+    * Intended for the output of classification models.
+    * For actual labels in the UI, maybe use `gr.Markdown`?
+
+* `Button`
+    * `link`: link to open when the button is clicked?
+    * `icon`: path to an icon to display on the button.
+
+* `Audio`: relevant event listeners:
+    * `upload`: when a value is uploaded
+    * `input`: when a value is changed
+    * `clear`: when a value is cleared
+* `Dataframe`
+    * `height`
+    * `min_width`
+    * `wrap`: whether text in cells should wrap
+    * `column_widths`: width of each column
+    * `datatype`: list of `"str"`, `"number"`, `"bool"`, `"date"`, `"markdown"`
+
+## Performance optimization
+
+* Can set the `max_threads` argument for `Blocks.launch()` if you have any async definitions in your code (`async def`).
+* Can set the `max_size` argument on `Blocks.queue()`. This limits how many people can wait in line in the queue. If too many people are in line, new people trying to join will receive an error message. This can be better than the default, which is just having people wait indefinitely.
+* Can increase `default_concurrency_limit` for `Blocks.queue()`. The default is `1`. Increasing it might make processing more efficient.
+* Rewrite functions so that they take a batched input and set `batch=True` on the event listener calling the function.
+
+## Environment Variables
+
+Gradio supports environment variables which can be used to customize the behavior
+of your app from the command line instead of setting the corresponding parameters in `Blocks.launch()`:
+
+* GRADIO_ANALYTICS_ENABLED
+* GRADIO_SERVER_PORT
+* GRADIO_SERVER_NAME
+* GRADIO_TEMP_DIR
+* GRADIO_SHARE
+* GRADIO_ALLOWED_PATHS
+* GRADIO_BLOCKED_PATHS
+
+These could be useful when running Gradio apps from a shell script.
+
+## Networking
+
+### File Access
+
+Users can access:
+
+* Temporary files created by Gradio.
+* Files that are allowed via the `allowed_paths` parameter set in `Blocks.launch()`.
+* Static files that are set via [gr.set_static_paths](https://www.gradio.app/docs/gradio/set_static_paths).
+    * Accepts a list of directories or file names that will not be copied to the cache but served directly from the computer.
+    * BONUS: This can be used in ULTIMATE RVC for dispensing with the temporary Gradio directory. Need to consider possible ramifications before implementing this though.
+
+Users cannot access:
+
+* Files that are blocked via the `blocked_paths` parameter set in `Blocks.launch()`.
+    * This parameter takes precedence over the `allowed_paths` parameter and over default allowed paths.
+* Any other paths on the host machine.
+    * This is something to consider when hosting the app online.
+
+#### Limiting file upload size
+
+You can use `Blocks.launch(max_file_size=...)` to limit the maximum upload size (e.g. in MB) for each user.
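+
+For example, a minimal sketch (the `"100mb"` limit and the `describe` interface are just placeholders):
+
+```python
+import gradio as gr
+
+
+def describe(file_path: str) -> str:
+    return f"Received: {file_path}"
+
+
+demo = gr.Interface(describe, gr.File(file_types=[".zip"]), "textbox")
+
+# Uploads larger than the limit are rejected with an error in the UI.
+demo.launch(max_file_size="100mb")
+```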
+
+### Access network request
+
+You can access information from a network request directly within a Gradio app:
+
+```python
+import gradio as gr
+
+def echo(text, request: gr.Request):
+    if request:
+        print("Request headers dictionary:", request.headers)
+        print("IP address:", request.client.host)
+        print("Query parameters:", dict(request.query_params))
+    return text
+
+io = gr.Interface(echo, "textbox", "textbox").launch()
+```
+
+If the request does not come through the Gradio UI, the `request` object will be `None`, so always check that it exists.
+
+### Authentication
+
+#### Password protection
+
+You can put an authentication page in front of your app by doing:
+
+```python
+demo.launch(auth=("admin", "pass1234"))
+```
+
+More complex handling can be achieved by giving a function as input:
+
+```python
+def same_auth(username, password):
+    return username == password
+
+demo.launch(auth=same_auth)
+```
+
+Gradio also supports a logout page:
+
+```python
+import gradio as gr
+
+def update_message(request: gr.Request):
+    return f"Welcome, {request.username}"
+
+with gr.Blocks() as demo:
+    m = gr.Markdown()
+    logout_button = gr.Button("Logout", link="/logout")
+    demo.load(update_message, None, m)
+
+demo.launch(auth=[("Pete", "Pete"), ("Dawood", "Dawood")])
+```
+
+NOTE:
+
+* For authentication to work properly, third-party cookies must be enabled in your browser. This is not the case by default for Safari or for Chrome Incognito Mode.
+* Gradio's built-in authentication provides a straightforward and basic layer of access control but does not offer robust security features for applications that require stringent access controls (e.g. multi-factor authentication, rate limiting, or automatic lockout policies).
+
+##### Custom user content
+
+Customize content for each user by accessing the network request directly:
+
+```python
+import gradio as gr
+
+def update_message(request: gr.Request):
+    return f"Welcome, {request.username}"
+
+with gr.Blocks() as demo:
+    m = gr.Markdown()
+    demo.load(update_message, None, m)
+
+demo.launch(auth=[("Abubakar", "Abubakar"), ("Ali", "Ali")])
+```
+
+#### OAuth Authentication
+
+See
+
+## Styling
+
+### UI Layout
+
+#### `gr.Row`
+
+* `equal_height=False` will not force components on the same row to have the same height.
+* Experiment with `variant='panel'` or `variant='compact'` for a different look.
+
+#### `gr.Column`
+
+* Experiment with `variant='panel'` or `variant='compact'` for a different look.
+
+#### `gr.Blocks`
+
+* `fill_height=True` and `fill_width=True` can be used to fill the browser window.
+
+#### `gr.Component`
+
+* `scale=0` can be used to prevent a component from expanding to take up space.
+
+### DataFrame styling
+
+See
+
+### Themes
+
+```python
+with gr.Blocks(theme=gr.themes.Glass()):
+...
+```
+
+See this [theming guide](https://www.gradio.app/guides/theming-guide) for how to create new custom themes, both using the Gradio theme builder and in code.
+
+### Custom CSS
+
+Change the background color to red:
+
+```python
+with gr.Blocks(css=".gradio-container {background-color: red}") as demo:
+...
+```
+
+Set the background to an image file:
+
+```python
+with gr.Blocks(css=".gradio-container {background: url('file=clouds.jpg')}") as demo:
+...
+```
+
+#### Customize Component style
+
+Use `elem_id` and `elem_classes` when instantiating a component.
 This will allow you to select elements more easily with CSS:
+
+```python
+css = """
+#warning {background-color: #FFCCCB}
+.feedback textarea {font-size: 24px !important}
+"""
+
+with gr.Blocks(css=css) as demo:
+    box1 = gr.Textbox(value="Good Job", elem_classes="feedback")
+    box2 = gr.Textbox(value="Failure", elem_id="warning", elem_classes="feedback")
+```
+
+* `elem_id` adds an HTML element id to the specific component.
+* `elem_classes` adds a class or list of classes to the component.
+
+## Custom front-end logic
+
+### Custom Javascript
+
+You can add JavaScript:
+
+* as a string or file path when instantiating a block: `gr.Blocks(js=<path or string>)`
+    * The JavaScript will be executed when the app loads?
+* as a string passed to an event listener. This JavaScript code will be executed before the main function attached to the event listener.
+* by adding JavaScript code to the `head` param of the `gr.Blocks` initializer. This will add the code to the head of the HTML document:
+
+    ```python
+    head = f"""
+
+    """
+
+    with gr.Blocks(head=head) as demo:
+        ...demo code...
+    ```
+
+### Custom Components
+
+See
+
+## Connecting to databases
+
+Might be useful when we need to retrieve voice models hosted online later.
+
+Can import data using a combination of `sqlalchemy.create_engine` and `pandas.read_sql_query`:
+
+```python
+import gradio as gr
+import pandas as pd
+from sqlalchemy import create_engine
+
+engine = create_engine('sqlite:///your_database.db')
+
+with gr.Blocks() as demo:
+    origin = gr.Dropdown(["DFW", "DAL", "HOU"], value="DFW", label="Origin")
+
+    gr.LinePlot(
+        lambda origin: pd.read_sql_query(
+            f"SELECT time, price from flight_info WHERE origin = '{origin}';",
+            engine
+        ), inputs=origin, x="time", y="price")
+```
+
+## Sharing a Gradio App
+
+### Direct sharing
+
+* You can do `Blocks.launch(share=True)` to launch the app on a public link that expires after 72 hours.
+* It is possible to set up your own Share Server on your own cloud server to overcome this restriction.
+    * See
+
+### Embedding hosted HF space
+
+You can embed a Gradio app hosted on Hugging Face Spaces into any other web app.
+
+## Gradio app in production
+
+Useful information for migrating a Gradio app to production.
+
+### App hosting
+
+#### Custom web-server with Nginx
+
+See
+
+#### Deploying a gradio app with docker
+
+See
+
+#### Running serverless apps
+
+Web apps hosted completely in your browser (without any server for the backend) can be implemented using a combination of Gradio Lite + transformers.js.
+
+More information:
+
+*
+*
+
+#### Zero-GPU spaces
+
+In development.
+
+See
+
+#### Analytics dashboard
+
+Used for monitoring traffic.
+
+Analytics can be disabled by setting `analytics_enabled=False` as an argument to `gr.Blocks()`.
+
+### Gradio App as API
+
+Each Gradio app has a button that redirects you to the documentation for a corresponding API. This API can be called via:
+
+* Dedicated [Python](https://www.gradio.app/guides/getting-started-with-the-python-client) or [Javascript](https://www.gradio.app/guides/getting-started-with-the-js-client) API clients.
+* [Curl](https://www.gradio.app/guides/querying-gradio-apps-with-curl)
+* The community-made [Rust client](https://www.gradio.app/docs/third-party-clients/rust-client).
+
+Alternatively, one can:
+
+* mount the Gradio app within an existing FastAPI application
+* do a combination where the Python Gradio client is used inside a FastAPI app to query an endpoint from a Gradio app (see the sketch below).
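+
+A rough sketch of the second option, using the `gradio_client` package inside a FastAPI route (the Gradio app URL and the `/predict` endpoint name are placeholders):
+
+```python
+from fastapi import FastAPI
+from gradio_client import Client
+
+app = FastAPI()
+
+# Placeholder URL of a running Gradio app; a Hugging Face space name also works.
+client = Client("http://127.0.0.1:7860/")
+
+
+@app.get("/predict")
+def predict(text: str) -> dict[str, str]:
+    # Forward the request to the Gradio app's named API endpoint.
+    result = client.predict(text, api_name="/predict")
+    return {"result": str(result)}
+```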
+ +#### Mounting app within FastAPI app + +```python +from fastapi import FastAPI +import gradio as gr + +CUSTOM_PATH = "/gradio" + +app = FastAPI() + +@app.get("/") +def read_main(): + return {"message": "This is your main app"} + +io = gr.Interface(lambda x: "Hello, " + x + "!", "textbox", "textbox") +app = gr.mount_gradio_app(app, io, path=CUSTOM_PATH) +``` + +* Run this from the terminal as you would normally start a FastAPI app: `uvicorn run:app` +* and navigate to in your browser. + +#### Using a block context as a function to call + +```python +english_translator = gr.load(name="spaces/gradio/english_translator") +def generate_text(text): + english_text = english_generator(text)[0]["generated_text"] +``` + +If the app you are loading defines more than one function, you can specify which function to use with the `fn_index` and `api_name` parameters: + +```python +translate_btn.click(translate, inputs=english, outputs=german, api_name="translate-to-german") +.... +english_generator(text, api_name="translate-to-german")[0]["generated_text"] +``` + +#### Automatic API documentation + +1. Record api calls to generate snippets of calls made in app. Gradio + +2. Gradio can then reconstruct documentation describing what happened + +#### LLM agents + +LLM agents such as those defined using LangChain can call gradio apps and compose the results they produce. + +More information: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..013eb40dd1f56ea218f1c2acb785a0a3f0196ce5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,225 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ultimate-rvc" +version = "0.1.24" +description = "Ultimate RVC" +readme = "README.md" +requires-python = "==3.12.*" +dependencies = [ + # General + "lib==4.0.0", + + #Validation + "pydantic==2.9.2", + + # CLI + "typer==0.12.5", + + # Networking + "requests==2.32.3", + "yt_dlp==2024.11.4", + "nodejs-wheel-binaries==22.11.0", + # TODO add these later + # "deemix", + # "wget", + # "flask", + # "beautifulsoup4", + # "pypresence", + + # Data science + "numpy==1.26.4", + "scipy==1.14.1", + "matplotlib==3.9.2", + "tqdm==4.66.6", + "gradio==5.6.0", + + # Machine learning + "torch==2.5.1+cu124", + "torchaudio==2.5.1+cu124", + "torchcrepe==0.0.23", + "fairseq-fixed==0.12.3.1", + "faiss-cpu==1.9.0", + # Version of onnxruntime-gpu needs to align with what + # version audio-separator package uses. 
+ "onnxruntime-gpu==1.19.2", + "tensorboardX==2.6.2.2", + # TODO add these later + # "tensorboard", + # "torchfcpe", + # "local-attention", + # "libf0", + # "einops", + # "numba; sys_platform == 'linux'", + # "numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32'", + + # Audio + "static-ffmpeg==2.7", + "static-sox==1.0.1", + "typed-ffmpeg==2.1.0", + "soundfile==0.12.1", + "librosa==0.10.2", + "sox==1.5.0", + "pydub==0.25.1", + "pydub-stubs==0.25.1.2", + "pedalboard==0.9.16", + "audio-separator[gpu]==0.24.1", + "praat-parselmouth==0.4.5", + "pyworld-fixed==0.3.8", + "diffq==0.2.4" + # TODO add the later + # "noisereduce", + # "audio_upscaler==0.1.4", + # "edge-tts==6.1.9", + # "ffmpeg-python>=0.2.0", + # "ffmpy==0.3.1" +] + +[project.scripts] +urvc = "ultimate_rvc.cli.main:app" +urvc-web = "ultimate_rvc.web.main:app_wrapper" + +[tool.uv] +environments = ["sys_platform == 'win32'", "sys_platform == 'linux'"] +cache-dir = "./uv/cache" +compile-bytecode = true + +[tool.uv.sources] +torch = { index = "torch-cu124"} +torchaudio = { index = "torch-cu124"} +diffq = { url = "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/diffq-0.2.4-cp312-cp312-win_amd64.whl", marker = "sys_platform == 'win32'"} + +[[tool.uv.index]] +name = "torch-cu124" +url = "https://download.pytorch.org/whl/cu124" +explicit = true + +[tool.pyright] +stubPath = "src/ultimate_rvc/stubs" +pythonVersion = "3.12" +pythonPlatform = "All" +typeCheckingMode = "strict" +ignore = ["**/.venv"] +exclude = ["./uv"] + +[tool.black] +target-version = ['py312'] +preview = true +enable-unstable-feature = ["string_processing"] + +[tool.ruff] +target-version = "py312" +fix = true +required-version = ">=0.5.7" + +[tool.ruff.format] +docstring-code-format = true +preview = true + +[tool.ruff.lint] +select = ["ALL"] +extend-select = ["I"] +ignore = [ + # Ignore missing blank before between class name and docstring + "D203", + # Do not require a description after summary line in docstring + "D205", + # Do not require summary line to be located on first physical line of docstring + "D212", + # Do not require docstring section names to end with colon + "D416", + # Ignore TODO notes + "FIX002", + "TD002", + "TD003", + "TD004", + # Ignore missing copyright notice + "CPY001", + # Ignore function signatures with too many arguments + "PLR0913", + # ignore function signatures with too many positional arguments + "PLR0917", + # Ignore boolean positional argument in function signature + "FBT002", + "FBT001", +] +unfixable = ["F401"] +preview = true + +[tool.ruff.lint.flake8-annotations] +#ignore-fully-untyped = true + +[tool.ruff.lint.isort] +relative-imports-order = "closest-to-furthest" +section-order = [ + "future", + "typing", + "standard-library", + "third-party", + "networking", + "validation", + "data-science", + "machine-learning", + "audio", + "cli", + "first-party", + "ultimate_rvc", + "local-folder", +] + +[tool.ruff.lint.isort.sections] +"typing" = ["typing", "typing_extensions"] +"networking" = [ + "requests", + "yt_dlp", + "deemix", + "wget", + "flask", + "beautifulsoup4", + "pypresence", +] +"validation" = ["pydantic"] +"data-science" = [ + "numpy", + "scipy", + "matplotlib", + "tqdm", + "pandas", + "gradio" +] +"machine-learning" = [ + "torch", + "torchaudio", + "torchcrepe", + "fairseq", + "faiss", + "tensorboard", + "torchfcpe", + "local_attention", + "libf0", + "einops", + "numba", +] +"audio" = [ + "static_ffmpeg", + "static_sox", + "ffmpeg", + "soundfile", + "librosa", + "sox", + "pydub", + "pedalboard", + 
"audio_separator", + "parselmouth", + "pyworld", + "noisereduce", + "audio_upscaler", + "edge_tts", + "ffmpy", +] +"cli" = ["typer", "rich"] +"ultimate_rvc" = ["ultimate_rvc"] +[tool.ruff.lint.pycodestyle] +max-doc-length = 72 diff --git a/src/ultimate_rvc/__init__.py b/src/ultimate_rvc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f548ee4413ee775ebe71cc21c5398390e8777208 --- /dev/null +++ b/src/ultimate_rvc/__init__.py @@ -0,0 +1,40 @@ +"""The Ultimate RVC project.""" + +import logging +import os +from logging.handlers import RotatingFileHandler +from pathlib import Path + +from ultimate_rvc.common import BASE_DIR + +logger = logging.getLogger() + +URVC_NO_LOGGING = os.getenv("URVC_NO_LOGGING", "0") == "1" +URVC_LOGS_DIR = Path(os.getenv("URVC_LOGS_DIR") or BASE_DIR / "logs") +URVC_CONSOLE_LOG_LEVEL = os.getenv("URVC_CONSOLE_LOG_LEVEL", "ERROR") +URVC_FILE_LOG_LEVEL = os.getenv("URVC_FILE_LOG_LEVEL", "INFO") + +if URVC_NO_LOGGING: + logging.basicConfig(handlers=[logging.NullHandler()]) + +else: + stream_handler = logging.StreamHandler() + stream_handler.setLevel(URVC_CONSOLE_LOG_LEVEL) + + URVC_LOGS_DIR.mkdir(exist_ok=True, parents=True) + file_handler = RotatingFileHandler( + URVC_LOGS_DIR / "ultimate_rvc.log", + mode="a", + maxBytes=1024 * 1024 * 5, + backupCount=1, + encoding="utf-8", + ) + file_handler.setLevel(URVC_FILE_LOG_LEVEL) + + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + style="%", + level=logging.DEBUG, + handlers=[stream_handler, file_handler], + ) diff --git a/src/ultimate_rvc/cli/__init__.py b/src/ultimate_rvc/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53cba53ec5bb02021d18d29c0300e05173e70d0a --- /dev/null +++ b/src/ultimate_rvc/cli/__init__.py @@ -0,0 +1,8 @@ +""" +Package which defines the command-line interface for the Ultimate RVC +project. +""" + +from ultimate_rvc.core.main import initialize + +initialize() diff --git a/src/ultimate_rvc/cli/generate/song_cover.py b/src/ultimate_rvc/cli/generate/song_cover.py new file mode 100644 index 0000000000000000000000000000000000000000..d823db13268e150242f6632a65ef6dce22ebcaf4 --- /dev/null +++ b/src/ultimate_rvc/cli/generate/song_cover.py @@ -0,0 +1,409 @@ +""" +Module which defines the command-line interface for generating a song +cover. +""" + +from typing import Annotated + +from pathlib import Path + +import typer +from rich import print as rprint +from rich.panel import Panel +from rich.table import Table + +from ultimate_rvc.core.generate.song_cover import run_pipeline as _run_pipeline +from ultimate_rvc.core.generate.song_cover import to_wav as _to_wav +from ultimate_rvc.typing_extra import AudioExt, F0Method + +app = typer.Typer( + name="song-cover", + no_args_is_help=True, + help="Generate song covers", + rich_markup_mode="markdown", +) + + +def complete_name(incomplete: str, enumeration: list[str]) -> list[str]: + """ + Return a list of names that start with the incomplete string. + + Parameters + ---------- + incomplete : str + The incomplete string to complete. + enumeration : list[str] + The list of names to complete from. + + Returns + ------- + list[str] + The list of names that start with the incomplete string. + + """ + return [name for name in list(enumeration) if name.startswith(incomplete)] + + +def complete_audio_ext(incomplete: str) -> list[str]: + """ + Return a list of audio extensions that start with the incomplete + string. 
+ + Parameters + ---------- + incomplete : str + The incomplete string to complete. + + Returns + ------- + list[str] + The list of audio extensions that start with the incomplete + string. + + """ + return complete_name(incomplete, list(AudioExt)) + + +def complete_f0_method(incomplete: str) -> list[str]: + """ + Return a list of F0 methods that start with the incomplete string. + + Parameters + ---------- + incomplete : str + The incomplete string to complete. + + Returns + ------- + list[str] + The list of F0 methods that start with the incomplete string. + + """ + return complete_name(incomplete, list(F0Method)) + + +@app.command(no_args_is_help=True) +def to_wav( + audio_track: Annotated[ + Path, + typer.Argument( + help="The path to the audio track to convert.", + exists=True, + file_okay=True, + dir_okay=False, + resolve_path=True, + ), + ], + song_dir: Annotated[ + Path, + typer.Argument( + help=( + "The path to the song directory where the converted audio track will be" + " saved." + ), + exists=True, + file_okay=False, + dir_okay=True, + resolve_path=True, + ), + ], + prefix: Annotated[ + str, + typer.Argument( + help="The prefix to use for the name of the converted audio track.", + ), + ], + accepted_format: Annotated[ + list[AudioExt] | None, + typer.Option( + case_sensitive=False, + autocompletion=complete_audio_ext, + help=( + "An audio format to accept for conversion. This option can be used" + " multiple times to accept multiple formats. If not provided, the" + " default accepted formats are mp3, ogg, flac, m4a and aac." + ), + ), + ] = None, +) -> None: + """ + Convert a given audio track to wav format if its current format + is an accepted format. See the --accepted-formats option for more + information on accepted formats. + + """ + rprint() + wav_path = _to_wav( + audio_track=audio_track, + song_dir=song_dir, + prefix=prefix, + accepted_formats=set(accepted_format) if accepted_format else None, + ) + if wav_path == audio_track: + rprint( + "[+] Audio track was not converted to WAV format. Presumably, " + "its format is not in the given list of accepted formats.", + ) + else: + rprint("[+] Audio track succesfully converted to WAV format!") + rprint(Panel(f"[green]{wav_path}", title="WAV Audio Track Path")) + + +@app.command(no_args_is_help=True) +def run_pipeline( + source: Annotated[ + str, + typer.Argument( + help=( + "A Youtube URL, the path to a local audio file or the path to a" + " song directory." + ), + ), + ], + model_name: Annotated[ + str, + typer.Argument(help="The name of the voice model to use for vocal conversion."), + ], + n_octaves: Annotated[ + int, + typer.Option( + rich_help_panel="Vocal Conversion Options", + help=( + "The number of octaves to pitch-shift the converted vocals by.Use 1 for" + " male-to-female and -1 for vice-versa." + ), + ), + ] = 0, + n_semitones: Annotated[ + int, + typer.Option( + rich_help_panel="Vocal Conversion Options", + help=( + "The number of semi-tones to pitch-shift the converted vocals," + " instrumentals, and backup vocals by. Altering this slightly reduces" + " sound quality" + ), + ), + ] = 0, + f0_method: Annotated[ + F0Method, + typer.Option( + case_sensitive=False, + autocompletion=complete_f0_method, + rich_help_panel="Vocal Conversion Options", + help=( + "The method to use for pitch detection during vocal conversion. Best" + " option is RMVPE (clarity in vocals), then Mangio-Crepe (smoother" + " vocals)." 
+ ), + ), + ] = F0Method.RMVPE, + index_rate: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Conversion Options", + help=( + "A decimal number e.g. 0.5, Controls how much of the accent in the" + " voice model to keep in the converted vocals. Increase to bias the" + " conversion towards the accent of the voice model." + ), + ), + ] = 0.5, + filter_radius: Annotated[ + int, + typer.Option( + min=0, + max=7, + rich_help_panel="Vocal Conversion Options", + help=( + "A number between 0 and 7. If >=3: apply median filtering to the pitch" + " results harvested during vocal conversion. Can help reduce" + " breathiness in the converted vocals." + ), + ), + ] = 3, + rms_mix_rate: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Conversion Options", + help=( + "A decimal number e.g. 0.25. Controls how much to mimic the loudness of" + " the input vocals (0) or a fixed loudness (1) during vocal conversion." + ), + ), + ] = 0.25, + protect: Annotated[ + float, + typer.Option( + min=0, + max=0.5, + rich_help_panel="Vocal Conversion Options", + help=( + "A decimal number e.g. 0.33. Controls protection of voiceless" + " consonants and breath sounds during vocal conversion. Decrease to" + " increase protection at the cost of indexing accuracy. Set to 0.5 to" + " disable." + ), + ), + ] = 0.33, + hop_length: Annotated[ + int, + typer.Option( + rich_help_panel="Vocal Conversion Options", + help=( + "Controls how often the CREPE-based pitch detection algorithm checks" + " for pitch changes during vocal conversion. Measured in milliseconds." + " Lower values lead to longer conversion times and a higher risk of" + " voice cracks, but better pitch accuracy. Recommended value: 128." + ), + ), + ] = 128, + room_size: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Post-processing Options", + help=( + "The room size of the reverb effect applied to the converted vocals." + " Increase for longer reverb time. Should be a value between 0 and 1." + ), + ), + ] = 0.15, + wet_level: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Post-processing Options", + help=( + "The loudness of the converted vocals with reverb effect applied." + " Should be a value between 0 and 1" + ), + ), + ] = 0.2, + dry_level: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Post-processing Options", + help=( + "The loudness of the converted vocals wihout reverb effect applied." + " Should be a value between 0 and 1." + ), + ), + ] = 0.8, + damping: Annotated[ + float, + typer.Option( + min=0, + max=1, + rich_help_panel="Vocal Post-processing Options", + help=( + "The absorption of high frequencies in the reverb effect applied to the" + " converted vocals. Should be a value between 0 and 1." + ), + ), + ] = 0.7, + main_gain: Annotated[ + int, + typer.Option( + rich_help_panel="Audio Mixing Options", + help="The gain to apply to the post-processed vocals. Measured in dB.", + ), + ] = 0, + inst_gain: Annotated[ + int, + typer.Option( + rich_help_panel="Audio Mixing Options", + help=( + "The gain to apply to the pitch-shifted instrumentals. Measured in dB." + ), + ), + ] = 0, + backup_gain: Annotated[ + int, + typer.Option( + rich_help_panel="Audio Mixing Options", + help=( + "The gain to apply to the pitch-shifted backup vocals. Measured in dB." 
+ ), + ), + ] = 0, + output_sr: Annotated[ + int, + typer.Option( + rich_help_panel="Audio Mixing Options", + help="The sample rate of the song cover.", + ), + ] = 44100, + output_format: Annotated[ + AudioExt, + typer.Option( + case_sensitive=False, + autocompletion=complete_audio_ext, + rich_help_panel="Audio Mixing Options", + help="The audio format of the song cover.", + ), + ] = AudioExt.MP3, + output_name: Annotated[ + str | None, + typer.Option( + rich_help_panel="Audio Mixing Options", + help="The name of the song cover.", + ), + ] = None, +) -> None: + """Run the song cover generation pipeline.""" + [song_cover_path, *intermediate_audio_file_paths] = _run_pipeline( + source=source, + model_name=model_name, + n_octaves=n_octaves, + n_semitones=n_semitones, + f0_method=f0_method, + index_rate=index_rate, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + hop_length=hop_length, + room_size=room_size, + wet_level=wet_level, + dry_level=dry_level, + damping=damping, + main_gain=main_gain, + inst_gain=inst_gain, + backup_gain=backup_gain, + output_sr=output_sr, + output_format=output_format, + output_name=output_name, + progress_bar=None, + ) + table = Table() + table.add_column("Type") + table.add_column("Path") + for name, path in zip( + [ + "Song", + "Vocals", + "Instrumentals", + "Main vocals", + "Backup vocals", + "De-reverbed main vocals", + "Main vocals reverb", + "Converted vocals", + "Post-processed vocals", + "Pitch-shifted instrumentals", + "Pitch-shifted backup vocals", + ], + intermediate_audio_file_paths, + strict=True, + ): + table.add_row(name, f"[green]{path}") + rprint("[+] Song cover succesfully generated!") + rprint(Panel(f"[green]{song_cover_path}", title="Song Cover Path")) + rprint(Panel(table, title="Intermediate Audio Files")) diff --git a/src/ultimate_rvc/cli/main.py b/src/ultimate_rvc/cli/main.py new file mode 100644 index 0000000000000000000000000000000000000000..07d198627e6bdae1e407c0160ba4d95e5f716bd2 --- /dev/null +++ b/src/ultimate_rvc/cli/main.py @@ -0,0 +1,21 @@ +""" +Module which defines the command-line interface for the Ultimate RVC +project. +""" + +import typer + +from ultimate_rvc.cli.generate.song_cover import app as song_cover_app + +app = typer.Typer( + name="urvc-cli", + no_args_is_help=True, + help="CLI for the Ultimate RVC project", + rich_markup_mode="markdown", +) + +app.add_typer(song_cover_app) + + +if __name__ == "__main__": + app() diff --git a/src/ultimate_rvc/common.py b/src/ultimate_rvc/common.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c51417a6b1adf2361df247d78bd11ef236b29b --- /dev/null +++ b/src/ultimate_rvc/common.py @@ -0,0 +1,10 @@ +"""Common variables used in the Ultimate RVC project.""" + +from pathlib import Path + +BASE_DIR = Path.cwd() +MODELS_DIR = BASE_DIR / "models" +RVC_MODELS_DIR = MODELS_DIR / "rvc" +SEPARATOR_MODELS_DIR = MODELS_DIR / "audio_separator" +AUDIO_DIR = BASE_DIR / "audio" +TEMP_DIR = BASE_DIR / "temp" diff --git a/src/ultimate_rvc/core/__init__.py b/src/ultimate_rvc/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0d6f97bf14ed6e6c46603e9c966f5ee10b933850 --- /dev/null +++ b/src/ultimate_rvc/core/__init__.py @@ -0,0 +1,7 @@ +""" +core package for the Ultimate RVC project. + +This package contains modules for managing date and settings as well as +generating audio using RVC based methods. 
+ +""" diff --git a/src/ultimate_rvc/core/common.py b/src/ultimate_rvc/core/common.py new file mode 100644 index 0000000000000000000000000000000000000000..b33d6bb876d7978a5171e372d972ce6bb4f565fd --- /dev/null +++ b/src/ultimate_rvc/core/common.py @@ -0,0 +1,285 @@ +"""Common utility functions for the core of the Ultimate RVC project.""" + +import hashlib +import json +import shutil +from collections.abc import Sequence +from pathlib import Path + +import requests + +from pydantic import AnyHttpUrl, TypeAdapter, ValidationError + +import gradio as gr + +from rich import print as rprint + +from ultimate_rvc.common import AUDIO_DIR, RVC_MODELS_DIR +from ultimate_rvc.core.exceptions import Entity, HttpUrlError, NotFoundError +from ultimate_rvc.typing_extra import Json, StrPath + +RVC_DOWNLOAD_URL = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/" +INTERMEDIATE_AUDIO_BASE_DIR = AUDIO_DIR / "intermediate" +OUTPUT_AUDIO_DIR = AUDIO_DIR / "output" +FLAG_FILE = RVC_MODELS_DIR / ".initialized" + + +def display_progress( + message: str, + percentage: float | None = None, + progress_bar: gr.Progress | None = None, +) -> None: + """ + Display progress message and percentage in console and potentially + also Gradio progress bar. + + Parameters + ---------- + message : str + Message to display. + percentage : float, optional + Percentage to display. + progress_bar : gr.Progress, optional + The Gradio progress bar to update. + + """ + rprint(message) + if progress_bar is not None: + progress_bar(percentage, desc=message) + + +def remove_suffix_after(text: str, occurrence: str) -> str: + """ + Remove suffix after the first occurrence of a substring in a string. + + Parameters + ---------- + text : str + The string to remove the suffix from. + occurrence : str + The substring to remove the suffix after. + + Returns + ------- + str + The string with the suffix removed. + + """ + location = text.rfind(occurrence) + if location == -1: + return text + return text[: location + len(occurrence)] + + +def copy_files_to_new_dir(files: Sequence[StrPath], directory: StrPath) -> None: + """ + Copy files to a new directory. + + Parameters + ---------- + files : Sequence[StrPath] + Paths to the files to copy. + directory : StrPath + Path to the directory to copy the files to. + + Raises + ------ + NotFoundError + If a file does not exist. + + """ + dir_path = Path(directory) + dir_path.mkdir(parents=True) + for file in files: + file_path = Path(file) + if not file_path.exists(): + raise NotFoundError(entity=Entity.FILE, location=file_path) + shutil.copyfile(file_path, dir_path / file_path.name) + + +def copy_file_safe(src: StrPath, dest: StrPath) -> Path: + """ + Copy a file to a new location, appending a number if a file with the + same name already exists. + + Parameters + ---------- + src : strPath + The source file path. + dest : strPath + The candidate destination file path. + + Returns + ------- + Path + The final destination file path. + + """ + dest_path = Path(dest) + src_path = Path(src) + dest_dir = dest_path.parent + dest_dir.mkdir(parents=True, exist_ok=True) + dest_file = dest_path + counter = 1 + + while dest_file.exists(): + dest_file = dest_dir / f"{dest_path.stem} ({counter}){src_path.suffix}" + counter += 1 + + shutil.copyfile(src, dest_file) + return dest_file + + +def json_dumps(thing: Json) -> str: + """ + Dump a JSON-serializable object to a JSON string. + + Parameters + ---------- + thing : Json + The JSON-serializable object to dump. 
+ + Returns + ------- + str + The JSON string representation of the object. + + """ + return json.dumps(thing, ensure_ascii=False, indent=4) + + +def json_dump(thing: Json, file: StrPath) -> None: + """ + Dump a JSON-serializable object to a JSON file. + + Parameters + ---------- + thing : Json + The JSON-serializable object to dump. + file : StrPath + The path to the JSON file. + + """ + with Path(file).open("w", encoding="utf-8") as fp: + json.dump(thing, fp, ensure_ascii=False, indent=4) + + +def json_load(file: StrPath, encoding: str = "utf-8") -> Json: + """ + Load a JSON-serializable object from a JSON file. + + Parameters + ---------- + file : StrPath + The path to the JSON file. + encoding : str, default='utf-8' + The encoding of the JSON file. + + Returns + ------- + Json + The JSON-serializable object loaded from the JSON file. + + """ + with Path(file).open(encoding=encoding) as fp: + return json.load(fp) + + +def get_hash(thing: Json, size: int = 5) -> str: + """ + Get the hash of a JSON-serializable object. + + Parameters + ---------- + thing : Json + The JSON-serializable object to hash. + size : int, default=5 + The size of the hash in bytes. + + Returns + ------- + str + The hash of the JSON-serializable object. + + """ + return hashlib.blake2b( + json_dumps(thing).encode("utf-8"), + digest_size=size, + ).hexdigest() + + +# NOTE consider increasing size to 16 otherwise we might have problems +# with hash collisions +def get_file_hash(file: StrPath, size: int = 5) -> str: + """ + Get the hash of a file. + + Parameters + ---------- + file : StrPath + The path to the file. + size : int, default=5 + The size of the hash in bytes. + + Returns + ------- + str + The hash of the file. + + """ + with Path(file).open("rb") as fp: + file_hash = hashlib.file_digest(fp, lambda: hashlib.blake2b(digest_size=size)) + return file_hash.hexdigest() + + +def validate_url(url: str) -> None: + """ + Validate a HTTP-based URL. + + Parameters + ---------- + url : str + The URL to validate. + + Raises + ------ + HttpUrlError + If the URL is invalid. + + """ + try: + TypeAdapter(AnyHttpUrl).validate_python(url) + except ValidationError: + raise HttpUrlError(url) from None + + +def _download_base_model(url: str, name: str, directory: StrPath) -> None: + """ + Download a base model and save it to an existing directory. + + Parameters + ---------- + url : str + An URL pointing to a location where a base model is hosted. + name : str + The name of the base model to download. + directory : str + The path to the directory where the base model should be saved. 
+ + """ + dir_path = Path(directory) + with requests.get(f"{url}{name}", timeout=10) as r: + r.raise_for_status() + with (dir_path / name).open("wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + +def download_base_models() -> None: + """Download base models.""" + RVC_MODELS_DIR.mkdir(parents=True, exist_ok=True) + base_model_names = ["hubert_base.pt", "rmvpe.pt"] + for base_model_name in base_model_names: + if not Path(RVC_MODELS_DIR / base_model_name).is_file(): + rprint(f"Downloading {base_model_name}...") + _download_base_model(RVC_DOWNLOAD_URL, base_model_name, RVC_MODELS_DIR) diff --git a/src/ultimate_rvc/core/exceptions.py b/src/ultimate_rvc/core/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..3532d7eceb130bbaa3e6ca8a4b6972c2e1f453ac --- /dev/null +++ b/src/ultimate_rvc/core/exceptions.py @@ -0,0 +1,297 @@ +""" +Module which defines custom exception and enumerations used when +instiating and re-raising those exceptions. +""" + +from enum import StrEnum + +from ultimate_rvc.typing_extra import StrPath + + +class Entity(StrEnum): + """Enumeration of entities that can be provided.""" + + DIRECTORY = "directory" + DIRECTORIES = "directories" + FILE = "file" + FILES = "files" + URL = "URL" + MODEL_NAME = "model name" + MODEL_NAMES = "model names" + MODEL_FILE = "model file" + SOURCE = "source" + SONG_DIR = "song directory" + AUDIO_TRACK = "audio track" + AUDIO_TRACK_GAIN_PAIRS = "pairs of audio track and gain" + SONG = "song" + VOCALS_TRACK = "vocals track" + INSTRUMENTALS_TRACK = "instrumentals track" + BACKUP_VOCALS_TRACK = "backup vocals track" + MAIN_VOCALS_TRACK = "main vocals track" + + +class Location(StrEnum): + """Enumeration of locations where entities can be found.""" + + INTERMEDIATE_AUDIO_ROOT = "the root of the intermediate audio base directory" + OUTPUT_AUDIO_ROOT = "the root of the output audio directory" + EXTRACTED_ZIP_FILE = "extracted zip file" + + +class UIMessage(StrEnum): + """ + Enumeration of messages that can be displayed in the UI + in place of core exception messages. + """ + + NO_AUDIO_TRACK = "No audio tracks provided." + NO_SONG_DIR = "No song directory selected." + NO_SONG_DIRS = ( + "No song directories selected. Please select one or more song directories" + " containing intermediate audio files to delete." + ) + NO_OUTPUT_AUDIO_FILES = ( + "No files selected. Please select one or more output audio files to delete." + ) + NO_UPLOADED_FILES = "No files selected." + NO_VOICE_MODEL = "No voice model selected." + NO_VOICE_MODELS = "No voice models selected." + NO_SOURCE = ( + "No source provided. Please provide a valid Youtube URL, local audio file" + " or song directory." + ) + + +class NotProvidedError(ValueError): + """Raised when an entity is not provided.""" + + def __init__(self, entity: Entity, ui_msg: UIMessage | None = None) -> None: + """ + Initialize a NotProvidedError instance. + + Exception message will be formatted as: + + "No `` provided." + + Parameters + ---------- + entity : Entity + The entity that was not provided. + ui_msg : UIMessage, default=None + Message which, if provided, is displayed in the UI + instead of the default exception message. + + """ + super().__init__(f"No {entity} provided.") + self.ui_msg = ui_msg + + +class NotFoundError(OSError): + """Raised when an entity is not found.""" + + def __init__( + self, + entity: Entity, + location: StrPath | Location, + is_path: bool = True, + ) -> None: + """ + Initialize a NotFoundError instance. 
+ + Exception message will be formatted as: + + "`` not found `(`in `|` as:`)` ``." + + Parameters + ---------- + entity : Entity + The entity that was not found. + location : StrPath | Location + The location where the entity was not found. + is_path : bool, default=True + Whether the location is a path to the entity. + + """ + proposition = "at:" if is_path else "in" + entity_cap = entity.capitalize() if not entity.isupper() else entity + super().__init__( + f"{entity_cap} not found {proposition} {location}", + ) + + +class VoiceModelNotFoundError(OSError): + """Raised when a voice model is not found.""" + + def __init__(self, name: str) -> None: + r""" + Initialize a VoiceModelNotFoundError instance. + + Exception message will be formatted as: + + 'Voice model with name "``" not found.' + + Parameters + ---------- + name : str + The name of the voice model that was not found. + + """ + super().__init__(f'Voice model with name "{name}" not found.') + + +class VoiceModelExistsError(OSError): + """Raised when a voice model already exists.""" + + def __init__(self, name: str) -> None: + r""" + Initialize a VoiceModelExistsError instance. + + Exception message will be formatted as: + + "Voice model with name '``' already exists. Please provide + a different name for your voice model." + + Parameters + ---------- + name : str + The name of the voice model that already exists. + + """ + super().__init__( + f'Voice model with name "{name}" already exists. Please provide a different' + " name for your voice model.", + ) + + +class InvalidLocationError(OSError): + """Raised when an entity is in a wrong location.""" + + def __init__(self, entity: Entity, location: Location, path: StrPath) -> None: + r""" + Initialize an InvalidLocationError instance. + + Exception message will be formatted as: + + "`` should be located in `` but found at: + ``" + + Parameters + ---------- + entity : Entity + The entity that is in a wrong location. + location : Location + The correct location for the entity. + path : StrPath + The path to the entity. + + """ + entity_cap = entity.capitalize() if not entity.isupper() else entity + super().__init__( + f"{entity_cap} should be located in {location} but found at: {path}", + ) + + +class HttpUrlError(OSError): + """Raised when a HTTP-based URL is invalid.""" + + def __init__(self, url: str) -> None: + """ + Initialize a HttpUrlError instance. + + Exception message will be formatted as: + + "Invalid HTTP-based URL: ``" + + Parameters + ---------- + url : str + The invalid HTTP-based URL. + + """ + super().__init__( + f"Invalid HTTP-based URL: {url}", + ) + + +class YoutubeUrlError(OSError): + """ + Raised when an URL does not point to a YouTube video or + , potentially, a Youtube playlist. + """ + + def __init__(self, url: str, playlist: bool) -> None: + """ + Initialize a YoutubeURlError instance. + + Exception message will be formatted as: + + "URL does not point to a YouTube video `[`or playlist`]`: + ``" + + Parameters + ---------- + url : str + The URL that does not point to a YouTube video or playlist. + playlist : bool + Whether the URL might point to a YouTube playlist. + + """ + suffix = "or playlist" if playlist else "" + super().__init__( + f"Not able to access Youtube video {suffix} at: {url}", + ) + + +class UploadLimitError(ValueError): + """Raised when the upload limit for an entity is exceeded.""" + + def __init__(self, entity: Entity, limit: str | float) -> None: + """ + Initialize an UploadLimitError instance. 
+ + Exception message will be formatted as: + + "At most `` `` can be uploaded." + + Parameters + ---------- + entity : Entity + The entity for which the upload limit was exceeded. + limit : str + The upload limit. + + """ + super().__init__(f"At most {limit} {entity} can be uploaded.") + + +class UploadFormatError(ValueError): + """ + Raised when one or more uploaded entities have an invalid format + . + """ + + def __init__(self, entity: Entity, formats: list[str], multiple: bool) -> None: + """ + Initialize an UploadFileFormatError instance. + + + Exception message will be formatted as: + + "Only `` with the following formats can be uploaded + `(`by themselves | together`)`: ``." + + Parameters + ---------- + entity : Entity + The entity that was uploaded with an invalid format. + formats : list[str] + Valid formats. + multiple : bool + Whether multiple entities are uploaded. + + """ + suffix = "by themselves" if not multiple else "together (at most one of each)" + super().__init__( + f"Only {entity} with the following formats can be uploaded {suffix}:" + f" {', '.join(formats)}.", + ) diff --git a/src/ultimate_rvc/core/generate/__init__.py b/src/ultimate_rvc/core/generate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c462c1ad7d14175be00fa5a38be585be4bfaf30 --- /dev/null +++ b/src/ultimate_rvc/core/generate/__init__.py @@ -0,0 +1,13 @@ +""" +Package which defines modules that facilitate RVC based audio +generation. +""" + +import static_ffmpeg +import static_sox + +from ultimate_rvc.core.common import download_base_models + +download_base_models() +static_ffmpeg.add_paths() +static_sox.add_paths() diff --git a/src/ultimate_rvc/core/generate/song_cover.py b/src/ultimate_rvc/core/generate/song_cover.py new file mode 100644 index 0000000000000000000000000000000000000000..1da6e55eaaf88603f3892be273449b7f3d6231e8 --- /dev/null +++ b/src/ultimate_rvc/core/generate/song_cover.py @@ -0,0 +1,1728 @@ +""" +Module which defines functions that faciliatate song cover generation +using RVC. 
+""" + +import gc +import logging +import operator +import shutil +from collections.abc import Sequence +from contextlib import suppress +from functools import reduce +from itertools import starmap +from pathlib import Path +from urllib.parse import parse_qs, urlparse + +import yt_dlp + +from pydantic import ValidationError + +import gradio as gr + +import ffmpeg +import soundfile as sf +import sox +from audio_separator.separator import Separator +from pedalboard import Compressor, HighpassFilter, Reverb +from pedalboard._pedalboard import Pedalboard # noqa: PLC2701 +from pedalboard.io import AudioFile +from pydub import AudioSegment +from pydub import utils as pydub_utils + +from ultimate_rvc.common import RVC_MODELS_DIR, SEPARATOR_MODELS_DIR +from ultimate_rvc.core.common import ( + INTERMEDIATE_AUDIO_BASE_DIR, + OUTPUT_AUDIO_DIR, + copy_file_safe, + display_progress, + get_file_hash, + get_hash, + json_dump, + json_dumps, + json_load, + validate_url, +) +from ultimate_rvc.core.exceptions import ( + Entity, + InvalidLocationError, + Location, + NotFoundError, + NotProvidedError, + UIMessage, + VoiceModelNotFoundError, + YoutubeUrlError, +) +from ultimate_rvc.core.typing_extra import ( + AudioExtInternal, + ConvertedVocalsMetaData, + EffectedVocalsMetaData, + FileMetaData, + MixedSongMetaData, + PitchShiftMetaData, + SeparatedAudioMetaData, + SourceType, + StagedAudioMetaData, + WaveifiedAudioMetaData, +) +from ultimate_rvc.typing_extra import ( + AudioExt, + F0Method, + Json, + SegmentSize, + SeparationModel, + StrPath, +) +from ultimate_rvc.vc.rvc import Config, get_vc, load_hubert, rvc_infer + +logger = logging.getLogger(__name__) + + +def _get_audio_separator( + output_dir: StrPath = INTERMEDIATE_AUDIO_BASE_DIR, + output_format: str = AudioExt.WAV, + segment_size: int = SegmentSize.SEG_256, + sample_rate: int = 44100, +) -> Separator: + """ + Get an audio separator. + + Parameters + ---------- + output_dir : StrPath, default=INTERMEDIATE_AUDIO_BASE_DIR + The directory to save the separated audio to. + output_format : str, default=AudioExt.WAV + The format to save the separated audio in. + segment_size : int, default=SegmentSize.SEG_256 + The segment size to use for separation. + sample_rate : int, default=44100 + The sample rate to use for separation. + + Returns + ------- + Separator + An audio separator. + + """ + return Separator( + model_file_dir=SEPARATOR_MODELS_DIR, + output_dir=output_dir, + output_format=output_format, + sample_rate=sample_rate, + mdx_params={ + "hop_length": 1024, + "segment_size": segment_size, + "overlap": 0.001, + "batch_size": 1, + "enable_denoise": False, + }, + ) + + +def initialize_audio_separator(progress_bar: gr.Progress | None = None) -> None: + """ + Initialize the audio separator by downloading the models it uses. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Gradio progress bar to update. + + """ + audio_separator = _get_audio_separator() + for i, separator_model in enumerate(SeparationModel): + if not Path(SEPARATOR_MODELS_DIR / separator_model).is_file(): + display_progress( + f"Downloading {separator_model}...", + i / len(SeparationModel), + progress_bar, + ) + audio_separator.download_model_files(separator_model) + + +def _get_input_audio_path(directory: StrPath) -> Path | None: + """ + Get the path to the input audio file in the provided directory, if + it exists. + + The provided directory must be located in the root of the + intermediate audio base directory. 
+ + Parameters + ---------- + directory : StrPath + The path to a directory. + + Returns + ------- + Path | None + The path to the input audio file in the provided directory, if + it exists. + + Raises + ------ + NotFoundError + If the provided path does not point to an existing directory. + InvalidLocationError + If the provided path is not located in the root of the + intermediate audio base directory" + + """ + dir_path = Path(directory) + + if not dir_path.is_dir(): + raise NotFoundError(entity=Entity.DIRECTORY, location=dir_path) + + if dir_path.parent != INTERMEDIATE_AUDIO_BASE_DIR: + raise InvalidLocationError( + entity=Entity.DIRECTORY, + location=Location.INTERMEDIATE_AUDIO_ROOT, + path=dir_path, + ) + # NOTE directory should never contain more than one element which + # matches the pattern "00_*" + return next(dir_path.glob("00_*"), None) + + +def _get_input_audio_paths() -> list[Path]: + """ + Get the paths to all input audio files in the intermediate audio + base directory. + + Returns + ------- + list[Path] + The paths to all input audio files in the intermediate audio + base directory. + + """ + # NOTE if we later add .json file for input then + # we need to exclude those here + return list(INTERMEDIATE_AUDIO_BASE_DIR.glob("*/00_*")) + + +def get_named_song_dirs() -> list[tuple[str, str]]: + """ + Get the names of all saved songs and the paths to the + directories where they are stored. + + Returns + ------- + list[tuple[str, Path]] + A list of tuples containing the name of each saved song + and the path to the directory where it is stored. + + """ + return sorted( + [ + ( + path.stem.removeprefix("00_"), + str(path.parent), + ) + for path in _get_input_audio_paths() + ], + key=operator.itemgetter(0), + ) + + +def _get_model_name( + effected_vocals_track: StrPath | None = None, + song_dir: StrPath | None = None, +) -> str: + """ + Infer the name of the voice model used for vocal conversion from a + an effected vocals track in a given song directory. + + If a voice model name cannot be inferred, "Unknown" is returned. + + Parameters + ---------- + effected_vocals_track : StrPath, optional + The path to an effected vocals track. + song_dir : StrPath, optional + The path to a song directory. + + Returns + ------- + str + The name of the voice model used for vocal conversion. 
+ + """ + model_name = "Unknown" + if not (effected_vocals_track and song_dir): + return model_name + effected_vocals_path = Path(effected_vocals_track) + song_dir_path = Path(song_dir) + effected_vocals_json_path = song_dir_path / f"{effected_vocals_path.stem}.json" + if not effected_vocals_json_path.is_file(): + return model_name + effected_vocals_dict = json_load(effected_vocals_json_path) + try: + effected_vocals_metadata = EffectedVocalsMetaData.model_validate( + effected_vocals_dict, + ) + except ValidationError: + return model_name + converted_vocals_track_name = effected_vocals_metadata.vocals_track.name + converted_vocals_json_path = song_dir_path / Path( + converted_vocals_track_name, + ).with_suffix( + ".json", + ) + if not converted_vocals_json_path.is_file(): + return model_name + converted_vocals_dict = json_load(converted_vocals_json_path) + try: + converted_vocals_metadata = ConvertedVocalsMetaData.model_validate( + converted_vocals_dict, + ) + except ValidationError: + return model_name + return converted_vocals_metadata.model_name + + +def get_song_cover_name( + effected_vocals_track: StrPath | None = None, + song_dir: StrPath | None = None, + model_name: str | None = None, +) -> str: + """ + Generate a suitable name for a cover of a song based on the name + of that song and the voice model used for vocal conversion. + + If the path of an existing song directory is provided, the name + of the song is inferred from that directory. If a voice model is not + provided but the path of an existing song directory and the path of + an effected vocals track in that directory are provided, then the + voice model is inferred from the effected vocals track. + + Parameters + ---------- + effected_vocals_track : StrPath, optional + The path to an effected vocals track. + song_dir : StrPath, optional + The path to a song directory. + model_name : str, optional + The name of a voice model. + + Returns + ------- + str + The song cover name + + """ + song_name = "Unknown" + if song_dir and (song_path := _get_input_audio_path(song_dir)): + song_name = song_path.stem.removeprefix("00_") + model_name = model_name or _get_model_name(effected_vocals_track, song_dir) + + return f"{song_name} ({model_name} Ver)" + + +def _get_youtube_id(url: str, ignore_playlist: bool = True) -> str: + """ + Get the id of a YouTube video or playlist. + + Parameters + ---------- + url : str + URL which points to a YouTube video or playlist. + ignore_playlist : bool, default=True + Whether to get the id of the first video in a playlist or the + playlist id itself. + + Returns + ------- + str + The id of a YouTube video or playlist. + + Raises + ------ + YoutubeUrlError + If the provided URL does not point to a YouTube video + or playlist. 
+ + """ + yt_id = None + validate_url(url) + query = urlparse(url) + if query.hostname == "youtu.be": + yt_id = query.query[2:] if query.path[1:] == "watch" else query.path[1:] + + elif query.hostname in {"www.youtube.com", "youtube.com", "music.youtube.com"}: + if not ignore_playlist: + with suppress(KeyError): + yt_id = parse_qs(query.query)["list"][0] + elif query.path == "/watch": + yt_id = parse_qs(query.query)["v"][0] + elif query.path[:7] == "/watch/": + yt_id = query.path.split("/")[1] + elif query.path[:7] == "/embed/" or query.path[:3] == "/v/": + yt_id = query.path.split("/")[2] + if yt_id is None: + raise YoutubeUrlError(url=url, playlist=True) + + return yt_id + + +def init_song_dir( + source: str, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> tuple[Path, SourceType]: + """ + Initialize a directory for a song provided by a given source. + + + The song directory is initialized as follows: + + * If the source is a YouTube URL, the id of the video which + that URL points to is extracted. A new song directory with the name + of that id is then created, if it does not already exist. + * If the source is a path to a local audio file, the hash of + that audio file is extracted. A new song directory with the name of + that hash is then created, if it does not already exist. + * if the source is a path to an existing song directory, then + that song directory is used as is. + + Parameters + ---------- + source : str + The source providing the song to initialize a directory for. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + song_dir : Path + The path to the initialized song directory. + source_type : SourceType + The type of source provided. + + Raises + ------ + NotProvidedError + If no source is provided. + InvalidLocationError + If a provided path points to a directory that is not located in + the root of the intermediate audio base directory. + NotFoundError + If the provided source is a path to a file that does not exist. + + """ + if not source: + raise NotProvidedError(entity=Entity.SOURCE, ui_msg=UIMessage.NO_SOURCE) + source_path = Path(source) + + display_progress("[~] Initializing song directory...", percentage, progress_bar) + + # if source is a path to an existing song directory + if source_path.is_dir(): + if source_path.parent != INTERMEDIATE_AUDIO_BASE_DIR: + raise InvalidLocationError( + entity=Entity.DIRECTORY, + location=Location.INTERMEDIATE_AUDIO_ROOT, + path=source_path, + ) + display_progress( + "[~] Using existing song directory...", + percentage, + progress_bar, + ) + source_type = SourceType.SONG_DIR + return source_path, source_type + + # if source is a URL + if urlparse(source).scheme == "https": + source_type = SourceType.URL + song_id = _get_youtube_id(source) + + # if source is a path to a local audio file + elif source_path.is_file(): + source_type = SourceType.FILE + song_id = get_file_hash(source_path) + else: + raise NotFoundError(entity=Entity.FILE, location=source_path) + + song_dir_path = INTERMEDIATE_AUDIO_BASE_DIR / song_id + + song_dir_path.mkdir(parents=True, exist_ok=True) + + return song_dir_path, source_type + + +# NOTE consider increasing hash_size to 16. 
Otherwise +# we might have problems with hash collisions when using app as CLI +def get_unique_base_path( + song_dir: StrPath, + prefix: str, + args_dict: Json, + hash_size: int = 5, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Get a unique base path (a path without any extension) for a file in + a song directory by hashing the arguments used to generate + the audio that is stored or will be stored in that file. + + Parameters + ---------- + song_dir :StrPath + The path to a song directory. + prefix : str + The prefix to use for the base path. + args_dict : Json + A JSON-serializable dictionary of named arguments used to + generate the audio that is stored or will be stored in a file + in the song directory. + hash_size : int, default=5 + The size (in bytes) of the hash to use for the base path. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The unique base path for a file in a song directory. + + Raises + ------ + NotProvidedError + If no song directory is provided. + + """ + if not song_dir: + raise NotProvidedError(entity=Entity.SONG_DIR, ui_msg=UIMessage.NO_SONG_DIR) + song_dir_path = Path(song_dir) + dict_hash = get_hash(args_dict, size=hash_size) + while True: + base_path = song_dir_path / f"{prefix}_{dict_hash}" + json_path = base_path.with_suffix(".json") + if json_path.exists(): + file_dict = json_load(json_path) + if file_dict == args_dict: + return base_path + display_progress("[~] Rehashing...", percentage, progress_bar) + dict_hash = get_hash(dict_hash, size=hash_size) + else: + return base_path + + +def _get_youtube_audio(url: str, directory: StrPath) -> Path: + """ + Download audio from a YouTube video. + + Parameters + ---------- + url : str + URL which points to a YouTube video. + directory : StrPath + The directory to save the downloaded audio file to. + + Returns + ------- + Path + The path to the downloaded audio file. + + Raises + ------ + YoutubeUrlError + If the provided URL does not point to a YouTube video. + + """ + validate_url(url) + outtmpl = str(Path(directory, "00_%(title)s")) + ydl_opts = { + "quiet": True, + "no_warnings": True, + "format": "bestaudio", + "outtmpl": outtmpl, + "ignoreerrors": True, + "nocheckcertificate": True, + "postprocessors": [ + { + "key": "FFmpegExtractAudio", + "preferredcodec": "wav", + "preferredquality": 0, + }, + ], + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + result = ydl.extract_info(url, download=True) + if not result: + raise YoutubeUrlError(url, playlist=False) + file = ydl.prepare_filename(result, outtmpl=f"{outtmpl}.wav") + + return Path(file) + + +def retrieve_song( + source: str, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> tuple[Path, Path]: + """ + Retrieve a song from a source that can either be a YouTube URL, a + local audio file or a song directory. + + Parameters + ---------- + source : str + A Youtube URL, the path to a local audio file or the path to a + song directory. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + song : Path + The path to the retrieved song. + song_dir : Path + The path to the song directory containing the retrieved song. + + Raises + ------ + NotProvidedError + If no source is provided. 
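+
+ Examples
+ --------
+ Illustrative usage; the URL is a placeholder and downloading it
+ requires network access. The returned song path is located inside
+ the returned song directory.
+
+ >>> song, song_dir = retrieve_song("https://www.youtube.com/watch?v=abc123xyzAB")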
+ + """ + if not source: + raise NotProvidedError(entity=Entity.SOURCE, ui_msg=UIMessage.NO_SOURCE) + + song_dir_path, source_type = init_song_dir(source, progress_bar, percentage) + song_path = _get_input_audio_path(song_dir_path) + + if not song_path: + if source_type == SourceType.URL: + display_progress("[~] Downloading song...", percentage, progress_bar) + song_url = source.split("&")[0] + song_path = _get_youtube_audio(song_url, song_dir_path) + + else: + display_progress("[~] Copying song...", percentage, progress_bar) + source_path = Path(source) + song_name = f"00_{source_path.name}" + song_path = song_dir_path / song_name + shutil.copyfile(source_path, song_path) + + return song_path, song_dir_path + + +def _validate_exists( + identifier: StrPath, + entity: Entity, +) -> Path: + """ + Validate that the provided identifier is not none and that it + identifies an existing entity, which can be either a voice model, + a song directory or an audio track. + + Parameters + ---------- + identifier : StrPath + The identifier to validate. + entity : Entity + The entity that the identifier should identify. + + Returns + ------- + Path + The path to the identified entity. + + Raises + ------ + NotProvidedError + If the identifier is None. + NotFoundError + If the identifier does not identify an existing entity. + VoiceModelNotFoundError + If the identifier does not identify an existing voice model. + NotImplementedError + If the provided entity is not supported. + + """ + match entity: + case Entity.MODEL_NAME: + if not identifier: + raise NotProvidedError(entity=entity, ui_msg=UIMessage.NO_VOICE_MODEL) + path = RVC_MODELS_DIR / identifier + if not path.is_dir(): + raise VoiceModelNotFoundError(str(identifier)) + case Entity.SONG_DIR: + if not identifier: + raise NotProvidedError(entity=entity, ui_msg=UIMessage.NO_SONG_DIR) + path = Path(identifier) + if not path.is_dir(): + raise NotFoundError(entity=entity, location=path) + case ( + Entity.SONG + | Entity.AUDIO_TRACK + | Entity.VOCALS_TRACK + | Entity.INSTRUMENTALS_TRACK + | Entity.MAIN_VOCALS_TRACK + | Entity.BACKUP_VOCALS_TRACK + ): + if not identifier: + raise NotProvidedError(entity=entity) + path = Path(identifier) + if not path.is_file(): + raise NotFoundError(entity=entity, location=path) + case _: + error_msg = f"Entity {entity} not supported." + raise NotImplementedError(error_msg) + return path + + +def _validate_all_exist( + identifier_entity_pairs: Sequence[tuple[StrPath, Entity]], +) -> list[Path]: + """ + Validate that all provided identifiers are not none and that they + identify existing entities, which can be either voice models, song + directories or audio tracks. + + Parameters + ---------- + identifier_entity_pairs : Sequence[tuple[StrPath, Entity]] + The pairs of identifiers and entities to validate. + + Returns + ------- + list[Path] + The paths to the identified entities. + + """ + return list(starmap(_validate_exists, identifier_entity_pairs)) + + +def separate_audio( + audio_track: StrPath, + song_dir: StrPath, + model_name: SeparationModel, + segment_size: int, + display_msg: str = "[~] Separating audio...", + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> tuple[Path, Path]: + """ + Separate an audio track into a primary stem and a secondary stem. + + Parameters + ---------- + audio_track : StrPath + The path to the audio track to separate. + song_dir : StrPath + The path to the song directory where the separated primary stem + and secondary stem will be saved. 
+ model_name : str + The name of the model to use for audio separation. + segment_size : int + The segment size to use for audio separation. + display_msg : str + The message to display when separating the audio track. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + primary_path : Path + The path to the separated primary stem. + secondary_path : Path + The path to the separated secondary stem. + + """ + audio_path, song_dir_path = _validate_all_exist( + [(audio_track, Entity.AUDIO_TRACK), (song_dir, Entity.SONG_DIR)], + ) + + args_dict = SeparatedAudioMetaData( + audio_track=FileMetaData( + name=audio_path.name, + hash_id=get_file_hash(audio_path), + ), + model_name=model_name, + segment_size=segment_size, + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + prefix, + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for prefix in ["11_Stem_Primary", "11_Stem_Secondary"] + for suffix in [".wav", ".json"] + ] + + ( + primary_path, + primary_json_path, + secondary_path, + secondary_json_path, + ) = paths + + if not all(path.exists() for path in paths): + display_progress(display_msg, percentage, progress_bar) + audio_separator = _get_audio_separator( + output_dir=song_dir_path, + segment_size=segment_size, + ) + audio_separator.load_model(model_name) + audio_separator.separate( + str(audio_path), + primary_output_name=primary_path.stem, + secondary_output_name=secondary_path.stem, + ) + json_dump(args_dict, primary_json_path) + json_dump(args_dict, secondary_json_path) + + return primary_path, secondary_path + + +def _get_rvc_files(model_name: str) -> tuple[Path, Path | None]: + """ + Get the RVC model file and potential index file of a voice model. + + Parameters + ---------- + model_name : str + The name of the voice model to get the RVC files of. + + Returns + ------- + model_file : Path + The path to the RVC model file. + index_file : Path | None + The path to the RVC index file, if it exists. + + Raises + ------ + NotFoundError + If no model file exists in the voice model directory. + + + """ + model_dir_path = _validate_exists(model_name, Entity.MODEL_NAME) + file_path_map = { + ext: path + for path in model_dir_path.iterdir() + for ext in [".pth", ".index"] + if ext == path.suffix + } + + if ".pth" not in file_path_map: + raise NotFoundError( + entity=Entity.MODEL_FILE, + location=model_dir_path, + is_path=False, + ) + + model_file = model_dir_path / file_path_map[".pth"] + index_file = ( + model_dir_path / file_path_map[".index"] if ".index" in file_path_map else None + ) + + return model_file, index_file + + +def _convert( + voice_track: StrPath, + output_file: StrPath, + model_name: str, + n_semitones: int = 0, + f0_method: F0Method = F0Method.RMVPE, + index_rate: float = 0.5, + filter_radius: int = 3, + rms_mix_rate: float = 0.25, + protect: float = 0.33, + hop_length: int = 128, + output_sr: int = 44100, +) -> None: + """ + Convert a voice track using a voice model and save the result to a + an output file. + + Parameters + ---------- + voice_track : StrPath + The path to the voice track to convert. + output_file : StrPath + The path to the file to save the converted voice track to. + model_name : str + The name of the model to use for voice conversion. + n_semitones : int, default=0 + The number of semitones to pitch-shift the converted voice by. 
+ f0_method : F0Method, default=F0Method.RMVPE + The method to use for pitch detection. + index_rate : float, default=0.5 + The influence of the index file on the voice conversion. + filter_radius : int, default=3 + The filter radius to use for the voice conversion. + rms_mix_rate : float, default=0.25 + The blending rate of the volume envelope of the converted voice. + protect : float, default=0.33 + The protection rate for consonants and breathing sounds. + hop_length : int, default=128 + The hop length to use for crepe-based pitch detection. + output_sr : int, default=44100 + The sample rate of the output audio file. + + """ + rvc_model_path, rvc_index_path = _get_rvc_files(model_name) + device = "cuda:0" + config = Config(device, is_half=True) + hubert_model = load_hubert( + device, + str(RVC_MODELS_DIR / "hubert_base.pt"), + is_half=config.is_half, + ) + cpt, version, net_g, tgt_sr, vc = get_vc( + device, + config, + str(rvc_model_path), + is_half=config.is_half, + ) + + # convert main vocals + rvc_infer( + str(rvc_index_path) if rvc_index_path else "", + index_rate, + str(voice_track), + str(output_file), + n_semitones, + f0_method, + cpt, + version, + net_g, + filter_radius, + tgt_sr, + rms_mix_rate, + protect, + hop_length, + vc, + hubert_model, + output_sr, + ) + del hubert_model, cpt + gc.collect() + + +def convert( + vocals_track: StrPath, + song_dir: StrPath, + model_name: str, + n_octaves: int = 0, + n_semitones: int = 0, + f0_method: F0Method = F0Method.RMVPE, + index_rate: float = 0.5, + filter_radius: int = 3, + rms_mix_rate: float = 0.25, + protect: float = 0.33, + hop_length: int = 128, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Convert a vocals track using a voice model. + + Parameters + ---------- + vocals_track : StrPath + The path to the vocals track to convert. + song_dir : StrPath + The path to the song directory where the converted vocals track + will be saved. + model_name : str + The name of the model to use for vocal conversion. + n_octaves : int, default=0 + The number of octaves to pitch-shift the converted vocals by. + n_semitones : int, default=0 + The number of semitones to pitch-shift the converted vocals by. + f0_method : F0Method, default=F0Method.RMVPE + The method to use for pitch detection. + index_rate : float, default=0.5 + The influence of the index file on the vocal conversion. + filter_radius : int, default=3 + The filter radius to use for the vocal conversion. + rms_mix_rate : float, default=0.25 + The blending rate of the volume envelope of the converted + vocals. + protect : float, default=0.33 + The protection rate for consonants and breathing sounds. + hop_length : int, default=128 + The hop length to use for crepe-based pitch detection. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The path to the converted vocals track. 
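+
+ Notes
+ -----
+ The total pitch shift applied to the converted vocals is
+ ``n_octaves * 12 + n_semitones`` semitones.
+
+ Examples
+ --------
+ Illustrative usage; the paths and the voice model name are
+ placeholders for files that must already exist locally.
+
+ >>> converted_track = convert(
+ ...     "path/to/song_dir/vocals.wav",
+ ...     "path/to/song_dir",
+ ...     "MyVoiceModel",
+ ...     n_semitones=2,
+ ... )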
+ + """ + vocals_path, song_dir_path, _ = _validate_all_exist( + [ + (vocals_track, Entity.VOCALS_TRACK), + (song_dir, Entity.SONG_DIR), + (model_name, Entity.MODEL_NAME), + ], + ) + + n_semitones = n_octaves * 12 + n_semitones + + args_dict = ConvertedVocalsMetaData( + vocals_track=FileMetaData( + name=vocals_path.name, + hash_id=get_file_hash(vocals_path), + ), + model_name=model_name, + n_semitones=n_semitones, + f0_method=f0_method, + index_rate=index_rate, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + hop_length=hop_length, + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + "21_Vocals_Converted", + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for suffix in [".wav", ".json"] + ] + + converted_vocals_path, converted_vocals_json_path = paths + + if not all(path.exists() for path in paths): + display_progress("[~] Converting vocals using RVC...", percentage, progress_bar) + _convert( + vocals_path, + converted_vocals_path, + model_name, + n_semitones, + f0_method, + index_rate, + filter_radius, + rms_mix_rate, + protect, + hop_length, + output_sr=44100, + ) + json_dump(args_dict, converted_vocals_json_path) + return converted_vocals_path + + +def to_wav( + audio_track: StrPath, + song_dir: StrPath, + prefix: str, + accepted_formats: set[AudioExt] | None = None, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Convert a given audio track to wav format if its current format is + one of the given accepted formats. + + Parameters + ---------- + audio_track : StrPath + The path to the audio track to convert. + song_dir : StrPath + The path to the song directory where the converted audio track + will be saved. + prefix : str + The prefix to use for the name of the converted audio track. + accepted_formats : set[AudioExt], optional + The audio formats to accept for conversion. If None, the + accepted formats are mp3, ogg, flac, m4a and aac. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The path to the audio track in wav format or the original audio + track if it is not in one of the accepted formats. 
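+
+ Examples
+ --------
+ Illustrative usage; the paths are placeholders. With the default
+ accepted formats, a wav input is returned unchanged.
+
+ >>> wav_track = to_wav("path/to/song_dir/song.mp3", "path/to/song_dir", "30_Input")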
+ + """ + if accepted_formats is None: + accepted_formats = set(AudioExt) - {AudioExt.WAV} + + audio_path, song_dir_path = _validate_all_exist( + [(audio_track, Entity.AUDIO_TRACK), (song_dir, Entity.SONG_DIR)], + ) + + wav_path = audio_path + + song_info = pydub_utils.mediainfo(str(audio_path)) + logger.info("Song Info:\n%s", json_dumps(song_info)) + if any( + accepted_format in song_info["format_name"] + if accepted_format == AudioExt.M4A + else accepted_format == song_info["format_name"] + for accepted_format in accepted_formats + ): + args_dict = WaveifiedAudioMetaData( + audio_track=FileMetaData( + name=audio_path.name, + hash_id=get_file_hash(audio_path), + ), + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + prefix, + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for suffix in [".wav", ".json"] + ] + wav_path, wav_json_path = paths + if not all(path.exists() for path in paths): + display_progress( + "[~] Converting audio track to wav format...", + percentage, + progress_bar, + ) + + _, stderr = ( + ffmpeg.input(audio_path) + .output(filename=wav_path, f="wav") + .run( + overwrite_output=True, + quiet=True, + ) + ) + logger.info("FFmpeg stderr:\n%s", stderr.decode("utf-8")) + json_dump(args_dict, wav_json_path) + + return wav_path + + +def _add_effects( + audio_track: StrPath, + output_file: StrPath, + room_size: float = 0.15, + wet_level: float = 0.2, + dry_level: float = 0.8, + damping: float = 0.7, +) -> None: + """ + Add high-pass filter, compressor and reverb effects to an audio + track. + + Parameters + ---------- + audio_track : StrPath + The path to the audio track to add effects to. + output_file : StrPath + The path to the file to save the effected audio track to. + room_size : float, default=0.15 + The room size of the reverb effect. + wet_level : float, default=0.2 + The wetness level of the reverb effect. + dry_level : float, default=0.8 + The dryness level of the reverb effect. + damping : float, default=0.7 + The damping of the reverb effect. + + """ + board = Pedalboard( + [ + HighpassFilter(), + Compressor(ratio=4, threshold_db=-15), + Reverb( + room_size=room_size, + dry_level=dry_level, + wet_level=wet_level, + damping=damping, + ), + ], + ) + + with ( + AudioFile(str(audio_track)) as f, + AudioFile(str(output_file), "w", f.samplerate, f.num_channels) as o, + ): + # Read one second of audio at a time, until the file is empty: + while f.tell() < f.frames: + chunk = f.read(int(f.samplerate)) + effected = board(chunk, f.samplerate, reset=False) + o.write(effected) + + +def postprocess( + vocals_track: StrPath, + song_dir: StrPath, + room_size: float = 0.15, + wet_level: float = 0.2, + dry_level: float = 0.8, + damping: float = 0.7, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Apply high-pass filter, compressor and reverb effects to a vocals + track. + + Parameters + ---------- + vocals_track : StrPath + The path to the vocals track to add effects to. + song_dir : StrPath + The path to the song directory where the effected vocals track + will be saved. + room_size : float, default=0.15 + The room size of the reverb effect. + wet_level : float, default=0.2 + The wetness level of the reverb effect. + dry_level : float, default=0.8 + The dryness level of the reverb effect. + damping : float, default=0.7 + The damping of the reverb effect. + progress_bar : gr.Progress, optional + Gradio progress bar to update. 
+ percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The path to the effected vocals track. + + """ + vocals_path, song_dir_path = _validate_all_exist( + [(vocals_track, Entity.VOCALS_TRACK), (song_dir, Entity.SONG_DIR)], + ) + + vocals_path = to_wav( + vocals_path, + song_dir_path, + "30_Input", + accepted_formats={AudioExt.M4A, AudioExt.AAC}, + progress_bar=progress_bar, + percentage=percentage, + ) + + args_dict = EffectedVocalsMetaData( + vocals_track=FileMetaData( + name=vocals_path.name, + hash_id=get_file_hash(vocals_path), + ), + room_size=room_size, + wet_level=wet_level, + dry_level=dry_level, + damping=damping, + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + "31_Vocals_Effected", + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for suffix in [".wav", ".json"] + ] + + effected_vocals_path, effected_vocals_json_path = paths + + if not all(path.exists() for path in paths): + display_progress( + "[~] Applying audio effects to vocals...", + percentage, + progress_bar, + ) + _add_effects( + vocals_path, + effected_vocals_path, + room_size, + wet_level, + dry_level, + damping, + ) + json_dump(args_dict, effected_vocals_json_path) + return effected_vocals_path + + +def _pitch_shift(audio_track: StrPath, output_file: StrPath, n_semi_tones: int) -> None: + """ + Pitch-shift an audio track. + + Parameters + ---------- + audio_track : StrPath + The path to the audio track to pitch-shift. + output_file : StrPath + The path to the file to save the pitch-shifted audio track to. + n_semi_tones : int + The number of semi-tones to pitch-shift the audio track by. + + """ + y, sr = sf.read(audio_track) + tfm = sox.Transformer() + tfm.pitch(n_semi_tones) + y_shifted = tfm.build_array(input_array=y, sample_rate_in=sr) + sf.write(output_file, y_shifted, sr) + + +def pitch_shift( + audio_track: StrPath, + song_dir: StrPath, + n_semitones: int, + display_msg: str = "[~] Pitch-shifting audio...", + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Pitch shift an audio track by a given number of semi-tones. + + Parameters + ---------- + audio_track : StrPath + The path to the audio track to pitch shift. + song_dir : StrPath + The path to the song directory where the pitch-shifted audio + track will be saved. + n_semitones : int + The number of semi-tones to pitch-shift the audio track by. + display_msg : str + The message to display when pitch-shifting the audio track. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The path to the pitch-shifted audio track. 
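+
+ Examples
+ --------
+ Illustrative usage; the paths are placeholders. When ``n_semitones``
+ is 0, the input track (converted to wav only if it is an m4a or aac
+ file) is returned with its pitch unchanged.
+
+ >>> shifted_track = pitch_shift("path/to/song_dir/instrumentals.wav", "path/to/song_dir", -2)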
+ + """ + audio_path, song_dir_path = _validate_all_exist( + [(audio_track, Entity.AUDIO_TRACK), (song_dir, Entity.SONG_DIR)], + ) + + audio_path = to_wav( + audio_path, + song_dir_path, + "40_Input", + accepted_formats={AudioExt.M4A, AudioExt.AAC}, + progress_bar=progress_bar, + percentage=percentage, + ) + + shifted_audio_path = audio_path + + if n_semitones != 0: + args_dict = PitchShiftMetaData( + audio_track=FileMetaData( + name=audio_path.name, + hash_id=get_file_hash(audio_path), + ), + n_semitones=n_semitones, + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + "41_Audio_Shifted", + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for suffix in [".wav", ".json"] + ] + + shifted_audio_path, shifted_audio_json_path = paths + + if not all(path.exists() for path in paths): + display_progress(display_msg, percentage, progress_bar) + _pitch_shift(audio_path, shifted_audio_path, n_semitones) + json_dump(args_dict, shifted_audio_json_path) + + return shifted_audio_path + + +def _to_internal(audio_ext: AudioExt) -> AudioExtInternal: + """ + Map an audio extension to an internally recognized format. + + Parameters + ---------- + audio_ext : AudioExt + The audio extension to map. + + Returns + ------- + AudioExtInternal + The internal audio extension. + + """ + match audio_ext: + case AudioExt.M4A: + return AudioExtInternal.IPOD + case AudioExt.AAC: + return AudioExtInternal.ADTS + case _: + return AudioExtInternal(audio_ext) + + +def _mix_song( + audio_track_gain_pairs: Sequence[tuple[StrPath, int]], + output_file: StrPath, + output_sr: int = 44100, + output_format: AudioExt = AudioExt.MP3, +) -> None: + """ + Mix multiple audio tracks to create a song. + + Parameters + ---------- + audio_track_gain_pairs : Sequence[tuple[StrPath, int]] + A sequence of pairs each containing the path to an audio track + and the gain to apply to it. + output_file : StrPath + The path to the file to save the mixed song to. + output_sr : int, default=44100 + The sample rate of the mixed song. + output_format : AudioExt, default=AudioExt.MP3 + The audio format of the mixed song. + + """ + mixed_audio = reduce( + lambda a1, a2: a1.overlay(a2), + [ + AudioSegment.from_wav(audio_track) + gain + for audio_track, gain in audio_track_gain_pairs + ], + ) + mixed_audio_resampled = mixed_audio.set_frame_rate(output_sr) + mixed_audio_resampled.export( + output_file, + format=_to_internal(output_format), + ) + + +def mix_song( + audio_track_gain_pairs: Sequence[tuple[StrPath, int]], + song_dir: StrPath, + output_sr: int = 44100, + output_format: AudioExt = AudioExt.MP3, + output_name: str | None = None, + display_msg: str = "[~] Mixing audio tracks...", + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> Path: + """ + Mix multiple audio tracks to create a song. + + Parameters + ---------- + audio_track_gain_pairs : Sequence[tuple[StrPath, int]] + A sequence of pairs each containing the path to an audio track + and the gain to apply to it. + song_dir : StrPath + The path to the song directory where the song will be saved. + output_sr : int, default=44100 + The sample rate of the mixed song. + output_format : AudioExt, default=AudioExt.MP3 + The audio format of the mixed song. + output_name : str, optional + The name of the mixed song. + display_msg : str, default="[~] Mixing audio tracks..." + The message to display when mixing the audio tracks. + progress_bar : gr.Progress, optional + Gradio progress bar to update. 
+ percentage : float, default=0.5 + Percentage to display in the progress bar. + + Returns + ------- + Path + The path to the song cover. + + Raises + ------ + NotProvidedError + If no audio tracks are provided. + + """ + if not audio_track_gain_pairs: + raise NotProvidedError( + entity=Entity.AUDIO_TRACK_GAIN_PAIRS, + ui_msg=UIMessage.NO_AUDIO_TRACK, + ) + + audio_path_gain_pairs = [ + ( + to_wav( + _validate_exists(audio_track, Entity.AUDIO_TRACK), + song_dir, + "50_Input", + progress_bar=progress_bar, + percentage=percentage, + ), + gain, + ) + for audio_track, gain in audio_track_gain_pairs + ] + song_dir_path = _validate_exists(song_dir, Entity.SONG_DIR) + args_dict = MixedSongMetaData( + staged_audio_tracks=[ + StagedAudioMetaData( + audio_track=FileMetaData( + name=audio_path.name, + hash_id=get_file_hash(audio_path), + ), + gain=gain, + ) + for audio_path, gain in audio_path_gain_pairs + ], + output_sr=output_sr, + output_format=output_format, + ).model_dump() + + paths = [ + get_unique_base_path( + song_dir_path, + "51_Mix", + args_dict, + progress_bar=progress_bar, + percentage=percentage, + ).with_suffix(suffix) + for suffix in ["." + output_format, ".json"] + ] + + mix_path, mix_json_path = paths + + if not all(path.exists() for path in paths): + display_progress(display_msg, percentage, progress_bar) + + _mix_song(audio_path_gain_pairs, mix_path, output_sr, output_format) + json_dump(args_dict, mix_json_path) + output_name = output_name or get_song_cover_name( + audio_path_gain_pairs[0][0], + song_dir_path, + None, + ) + song_path = OUTPUT_AUDIO_DIR / f"{output_name}.{output_format}" + return copy_file_safe(mix_path, song_path) + + +def run_pipeline( + source: str, + model_name: str, + n_octaves: int = 0, + n_semitones: int = 0, + f0_method: F0Method = F0Method.RMVPE, + index_rate: float = 0.5, + filter_radius: int = 3, + rms_mix_rate: float = 0.25, + protect: float = 0.33, + hop_length: int = 128, + room_size: float = 0.15, + wet_level: float = 0.2, + dry_level: float = 0.8, + damping: float = 0.7, + main_gain: int = 0, + inst_gain: int = 0, + backup_gain: int = 0, + output_sr: int = 44100, + output_format: AudioExt = AudioExt.MP3, + output_name: str | None = None, + progress_bar: gr.Progress | None = None, +) -> tuple[Path, ...]: + """ + Run the song cover generation pipeline. + + Parameters + ---------- + source : str + A Youtube URL, the path to a local audio file or the path to a + song directory. + model_name : str + The name of the voice model to use for vocal conversion. + n_octaves : int, default=0 + The number of octaves to pitch-shift the converted vocals by. + n_semitones : int, default=0 + The number of semi-tones to pitch-shift the converted vocals, + instrumentals, and backup vocals by. + f0_method : F0Method, default=F0Method.RMVPE + The method to use for pitch detection during vocal conversion. + index_rate : float, default=0.5 + The influence of the index file on the vocal conversion. + filter_radius : int, default=3 + The filter radius to use for the vocal conversion. + rms_mix_rate : float, default=0.25 + The blending rate of the volume envelope of the converted + vocals. + protect : float, default=0.33 + The protection rate for consonants and breathing sounds during + vocal conversion. + hop_length : int, default=128 + The hop length to use for crepe-based pitch detection. + room_size : float, default=0.15 + The room size of the reverb effect to apply to the converted + vocals. 
+ wet_level : float, default=0.2 + The wetness level of the reverb effect to apply to the converted + vocals. + dry_level : float, default=0.8 + The dryness level of the reverb effect to apply to the converted + vocals. + damping : float, default=0.7 + The damping of the reverb effect to apply to the converted + vocals. + main_gain : int, default=0 + The gain to apply to the post-processed vocals. + inst_gain : int, default=0 + The gain to apply to the pitch-shifted instrumentals. + backup_gain : int, default=0 + The gain to apply to the pitch-shifted backup vocals. + output_sr : int, default=44100 + The sample rate of the song cover. + output_format : AudioExt, default=AudioExt.MP3 + The audio format of the song cover. + output_name : str, optional + The name of the song cover. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + + Returns + ------- + tuple[Path,...] + The path to the generated song cover and the paths to any + intermediate audio files that were generated. + + """ + _validate_exists(model_name, Entity.MODEL_NAME) + display_progress("[~] Starting song cover generation pipeline...", 0, progress_bar) + song, song_dir = retrieve_song( + source, + progress_bar=progress_bar, + percentage=0 / 9, + ) + vocals_track, instrumentals_track = separate_audio( + song, + song_dir, + SeparationModel.UVR_MDX_NET_VOC_FT, + SegmentSize.SEG_512, + display_msg="[~] Separating vocals from instrumentals...", + progress_bar=progress_bar, + percentage=1 / 9, + ) + backup_vocals_track, main_vocals_track = separate_audio( + vocals_track, + song_dir, + SeparationModel.UVR_MDX_NET_KARA_2, + SegmentSize.SEG_512, + display_msg="[~] Separating main vocals from backup vocals...", + progress_bar=progress_bar, + percentage=2 / 9, + ) + + reverb_track, vocals_dereverb_track = separate_audio( + main_vocals_track, + song_dir, + SeparationModel.REVERB_HQ_BY_FOXJOY, + SegmentSize.SEG_256, + display_msg="[~] De-reverbing vocals...", + progress_bar=progress_bar, + percentage=3 / 9, + ) + converted_vocals_track = convert( + vocals_dereverb_track, + song_dir, + model_name, + n_octaves, + n_semitones, + f0_method, + index_rate, + filter_radius, + rms_mix_rate, + protect, + hop_length, + progress_bar=progress_bar, + percentage=4 / 9, + ) + effected_vocals_track = postprocess( + converted_vocals_track, + song_dir, + room_size, + wet_level, + dry_level, + damping, + progress_bar=progress_bar, + percentage=5 / 9, + ) + shifted_instrumentals_track = pitch_shift( + instrumentals_track, + song_dir, + n_semitones, + display_msg="[~] Pitch-shifting instrumentals...", + progress_bar=progress_bar, + percentage=6 / 9, + ) + + shifted_backup_vocals_track = pitch_shift( + backup_vocals_track, + song_dir, + n_semitones, + display_msg="[~] Pitch-shifting backup vocals...", + progress_bar=progress_bar, + percentage=7 / 9, + ) + + song_cover = mix_song( + [ + (effected_vocals_track, main_gain), + (shifted_instrumentals_track, inst_gain), + (shifted_backup_vocals_track, backup_gain), + ], + song_dir, + output_sr, + output_format, + output_name, + display_msg="[~] Mixing main vocals, instrumentals, and backup vocals...", + progress_bar=progress_bar, + percentage=8 / 9, + ) + return ( + song_cover, + song, + vocals_track, + instrumentals_track, + main_vocals_track, + backup_vocals_track, + vocals_dereverb_track, + reverb_track, + converted_vocals_track, + effected_vocals_track, + shifted_instrumentals_track, + shifted_backup_vocals_track, + ) diff --git a/src/ultimate_rvc/core/main.py b/src/ultimate_rvc/core/main.py 
new file mode 100644 index 0000000000000000000000000000000000000000..bcb5b1bbb1200dde37f2c0e5980472780568eb0e --- /dev/null +++ b/src/ultimate_rvc/core/main.py @@ -0,0 +1,48 @@ +""" +Module which defines functions for initializing the core of the Ultimate +RVC project. +""" + +from pathlib import Path + +from rich import print as rprint + +from ultimate_rvc.common import RVC_MODELS_DIR +from ultimate_rvc.core.common import FLAG_FILE, download_base_models +from ultimate_rvc.core.generate.song_cover import initialize_audio_separator +from ultimate_rvc.core.manage.models import download_model + + +def download_sample_models() -> None: + """Download sample RVC models.""" + named_model_links = [ + ( + "https://huggingface.co/damnedraxx/TaylorSwift/resolve/main/TaylorSwift.zip", + "Taylor Swift", + ), + ( + "https://huggingface.co/Vermiculos/balladjames/resolve/main/Ballad%20James.zip?download=true", + "James Hetfield", + ), + ("https://huggingface.co/ryolez/MMLP/resolve/main/MMLP.zip", "Eminem"), + ] + for model_url, model_name in named_model_links: + if not Path(RVC_MODELS_DIR / model_name).is_dir(): + rprint(f"Downloading {model_name}...") + try: + download_model(model_url, model_name) + except Exception as e: + rprint(f"Failed to download {model_name}: {e}") + + +def initialize() -> None: + """Initialize the Ultimate RVC project.""" + download_base_models() + if not FLAG_FILE.is_file(): + download_sample_models() + FLAG_FILE.touch() + initialize_audio_separator() + + +if __name__ == "__main__": + initialize() diff --git a/src/ultimate_rvc/core/manage/__init__.py b/src/ultimate_rvc/core/manage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ceec80875eb70ade8e49b7d08aa7c4c8c35a59a --- /dev/null +++ b/src/ultimate_rvc/core/manage/__init__.py @@ -0,0 +1,4 @@ +""" +Package which defines modules that facilitate managing settings and +data. +""" diff --git a/src/ultimate_rvc/core/manage/audio.py b/src/ultimate_rvc/core/manage/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..d3a3ce9f329f715e79359d640ede797f3e6290ab --- /dev/null +++ b/src/ultimate_rvc/core/manage/audio.py @@ -0,0 +1,214 @@ +"""Module which defines functions to manage audio files.""" + +import operator +import shutil +from collections.abc import Sequence +from pathlib import Path + +import gradio as gr + +from ultimate_rvc.core.common import ( + INTERMEDIATE_AUDIO_BASE_DIR, + OUTPUT_AUDIO_DIR, + display_progress, +) +from ultimate_rvc.core.exceptions import ( + Entity, + InvalidLocationError, + Location, + NotFoundError, + NotProvidedError, + UIMessage, +) +from ultimate_rvc.typing_extra import StrPath + + +def get_saved_output_audio() -> list[tuple[str, str]]: + """ + Get the name and path of all output audio files. + + Returns + ------- + list[tuple[str, Path]] + A list of tuples containing the name and path of each output + audio file. + + """ + if OUTPUT_AUDIO_DIR.is_dir(): + named_output_files = [ + (file_path.name, str(file_path)) for file_path in OUTPUT_AUDIO_DIR.iterdir() + ] + return sorted(named_output_files, key=operator.itemgetter(0)) + return [] + + +def delete_intermediate_audio( + directories: Sequence[StrPath], + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete provided directories containing intermediate audio files. + + The provided directories must be located in the root of the + intermediate audio base directory. 
+ + Parameters + ---------- + directories : Sequence[StrPath] + Paths to directories containing intermediate audio files to + delete. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Raises + ------ + NotProvidedError + If no paths are provided. + NotFoundError + if a provided path does not point to an existing directory. + InvalidLocationError + If a provided path does not point to a location in the root of + the intermediate audio base directory. + + """ + if not directories: + raise NotProvidedError(entity=Entity.DIRECTORIES, ui_msg=UIMessage.NO_SONG_DIRS) + display_progress( + "[~] Deleting directories ...", + percentage, + progress_bar, + ) + for directory in directories: + dir_path = Path(directory) + if not dir_path.is_dir(): + raise NotFoundError(entity=Entity.DIRECTORY, location=dir_path) + if dir_path.parent != INTERMEDIATE_AUDIO_BASE_DIR: + raise InvalidLocationError( + entity=Entity.DIRECTORY, + location=Location.INTERMEDIATE_AUDIO_ROOT, + path=dir_path, + ) + shutil.rmtree(dir_path) + + +def delete_all_intermediate_audio( + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete all intermediate audio files. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + """ + display_progress( + "[~] Deleting all intermediate audio files...", + percentage, + progress_bar, + ) + if INTERMEDIATE_AUDIO_BASE_DIR.is_dir(): + shutil.rmtree(INTERMEDIATE_AUDIO_BASE_DIR) + + +def delete_output_audio( + files: Sequence[StrPath], + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete provided output audio files. + + The provided files must be located in the root of the output audio + directory. + + Parameters + ---------- + files : Sequence[StrPath] + Paths to the output audio files to delete. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Raises + ------ + NotProvidedError + If no paths are provided. + NotFoundError + If a provided path does not point to an existing file. + InvalidLocationError + If a provided path does not point to a location in the root of + the output audio directory. + + """ + if not files: + raise NotProvidedError( + entity=Entity.FILES, + ui_msg=UIMessage.NO_OUTPUT_AUDIO_FILES, + ) + display_progress( + "[~] Deleting output audio files...", + percentage, + progress_bar, + ) + for file in files: + file_path = Path(file) + if not file_path.is_file(): + raise NotFoundError(entity=Entity.FILE, location=file_path) + if file_path.parent != OUTPUT_AUDIO_DIR: + raise InvalidLocationError( + entity=Entity.FILE, + location=Location.OUTPUT_AUDIO_ROOT, + path=file_path, + ) + file_path.unlink() + + +def delete_all_output_audio( + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete all output audio files. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. 
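+
+ Examples
+ --------
+ Illustrative usage; this removes the entire output audio directory
+ if it exists.
+
+ >>> delete_all_output_audio()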
+ + """ + display_progress("[~] Deleting all output audio files...", percentage, progress_bar) + if OUTPUT_AUDIO_DIR.is_dir(): + shutil.rmtree(OUTPUT_AUDIO_DIR) + + +def delete_all_audio( + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete all audio files. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + """ + display_progress("[~] Deleting all audio files...", percentage, progress_bar) + if INTERMEDIATE_AUDIO_BASE_DIR.is_dir(): + shutil.rmtree(INTERMEDIATE_AUDIO_BASE_DIR) + if OUTPUT_AUDIO_DIR.is_dir(): + shutil.rmtree(OUTPUT_AUDIO_DIR) diff --git a/src/ultimate_rvc/core/manage/models.py b/src/ultimate_rvc/core/manage/models.py new file mode 100644 index 0000000000000000000000000000000000000000..796d1a1a6919d344ae9b9bb8b6391fbb0fb585a9 --- /dev/null +++ b/src/ultimate_rvc/core/manage/models.py @@ -0,0 +1,424 @@ +"""Module which defines functions to manage voice models.""" + +import re +import shutil +import urllib.request +import zipfile +from _collections_abc import Sequence +from pathlib import Path + +import gradio as gr + +from ultimate_rvc.common import RVC_MODELS_DIR +from ultimate_rvc.core.common import ( + FLAG_FILE, + copy_files_to_new_dir, + display_progress, + json_load, + validate_url, +) +from ultimate_rvc.core.exceptions import ( + Entity, + Location, + NotFoundError, + NotProvidedError, + UIMessage, + UploadFormatError, + UploadLimitError, + VoiceModelExistsError, + VoiceModelNotFoundError, +) +from ultimate_rvc.core.typing_extra import ( + ModelMetaData, + ModelMetaDataList, + ModelMetaDataPredicate, + ModelMetaDataTable, + ModelTagName, +) +from ultimate_rvc.typing_extra import StrPath + +PUBLIC_MODELS_JSON = json_load(Path(__file__).parent / "public_models.json") +PUBLIC_MODELS_TABLE = ModelMetaDataTable.model_validate(PUBLIC_MODELS_JSON) + + +def get_saved_model_names() -> list[str]: + """ + Get the names of all saved voice models. + + Returns + ------- + list[str] + A list of names of all saved voice models. + + """ + model_paths = RVC_MODELS_DIR.iterdir() + names_to_remove = ["hubert_base.pt", "rmvpe.pt", FLAG_FILE.name] + return sorted([ + model_path.name + for model_path in model_paths + if model_path.name not in names_to_remove + ]) + + +def load_public_models_table( + predicates: Sequence[ModelMetaDataPredicate], +) -> ModelMetaDataList: + """ + Load table containing metadata of public voice models, optionally + filtered by a set of predicates. + + Parameters + ---------- + predicates : Sequence[ModelMetaDataPredicate] + Predicates to filter the metadata table by. + + Returns + ------- + ModelMetaDataList + List containing metadata for each public voice model that + satisfies the given predicates. + + """ + return [ + [ + model.name, + model.description, + model.tags, + model.credit, + model.added, + model.url, + ] + for model in PUBLIC_MODELS_TABLE.models + if all(predicate(model) for predicate in predicates) + ] + + +def get_public_model_tags() -> list[ModelTagName]: + """ + get the names of all valid public voice model tags. + + Returns + ------- + list[str] + A list of names of all valid public voice model tags. + + """ + return [tag.name for tag in PUBLIC_MODELS_TABLE.tags] + + +def filter_public_models_table( + tags: Sequence[str], + query: str, +) -> ModelMetaDataList: + """ + Filter table containing metadata of public voice models by tags and + a search query. 
+ + + The search query is matched against the name, description, tags, + credit,and added date of each entry in the metadata table. Case + insensitive search is performed. If the search query is empty, the + metadata table is filtered only bythe given tags. + + Parameters + ---------- + tags : Sequence[str] + Tags to filter the metadata table by. + query : str + Search query to filter the metadata table by. + + Returns + ------- + ModelMetaDataList + List containing metadata for each public voice model that + match the given tags and search query. + + """ + + def _tags_predicate(model: ModelMetaData) -> bool: + return all(tag in model.tags for tag in tags) + + def _query_predicate(model: ModelMetaData) -> bool: + return ( + query.lower() + in ( + f"{model.name} {model.description} {' '.join(model.tags)} " + f"{model.credit} {model.added}" + ).lower() + if query + else True + ) + + filter_fns = [_tags_predicate, _query_predicate] + + return load_public_models_table(filter_fns) + + +def _extract_model( + zip_file: StrPath, + extraction_dir: StrPath, + remove_incomplete: bool = True, + remove_zip: bool = False, +) -> None: + """ + Extract a zipped voice model to a directory. + + Parameters + ---------- + zip_file : StrPath + The path to a zip file containing the voice model to extract. + extraction_dir : StrPath + The path to the directory to extract the voice model to. + + remove_incomplete : bool, default=True + Whether to remove the extraction directory if the extraction + process fails. + remove_zip : bool, default=False + Whether to remove the zip file once the extraction process is + complete. + + Raises + ------ + NotFoundError + If no model file is found in the extracted zip file. + + """ + extraction_path = Path(extraction_dir) + zip_path = Path(zip_file) + extraction_completed = False + try: + extraction_path.mkdir(parents=True) + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(extraction_path) + file_path_map = { + ext: Path(root, name) + for root, _, files in extraction_path.walk() + for name in files + for ext in [".index", ".pth"] + if Path(name).suffix == ext + and Path(root, name).stat().st_size + > 1024 * (100 if ext == ".index" else 1024 * 40) + } + if ".pth" not in file_path_map: + raise NotFoundError( + entity=Entity.MODEL_FILE, + location=Location.EXTRACTED_ZIP_FILE, + is_path=False, + ) + + # move model and index file to root of the extraction directory + for file_path in file_path_map.values(): + file_path.rename(extraction_path / file_path.name) + + # remove any sub-directories within the extraction directory + for path in extraction_path.iterdir(): + if path.is_dir(): + shutil.rmtree(path) + extraction_completed = True + finally: + if not extraction_completed and remove_incomplete and extraction_path.is_dir(): + shutil.rmtree(extraction_path) + if remove_zip and zip_path.exists(): + zip_path.unlink() + + +def download_model( + url: str, + name: str, + progress_bar: gr.Progress | None = None, + percentages: tuple[float, float] = (0.0, 0.5), +) -> None: + """ + Download a zipped voice model. + + Parameters + ---------- + url : str + An URL pointing to a location where the zipped voice model can + be downloaded from. + name : str + The name to give to the downloaded voice model. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentages : tuple[float, float], default=(0.0, 0.5) + Percentages to display in the progress bar. + + Raises + ------ + NotProvidedError + If no URL or name is provided. 
+ VoiceModelExistsError + If a voice model with the provided name already exists. + + """ + if not url: + raise NotProvidedError(entity=Entity.URL) + if not name: + raise NotProvidedError(entity=Entity.MODEL_NAME) + extraction_path = RVC_MODELS_DIR / name + if extraction_path.exists(): + raise VoiceModelExistsError(name) + + validate_url(url) + zip_name = url.split("/")[-1].split("?")[0] + + # NOTE in case huggingface link is a direct link rather + # than a resolve link then convert it to a resolve link + url = re.sub( + r"https://huggingface.co/([^/]+)/([^/]+)/blob/(.*)", + r"https://huggingface.co/\1/\2/resolve/\3", + url, + ) + if "pixeldrain.com" in url: + url = f"https://pixeldrain.com/api/file/{zip_name}" + + display_progress( + "[~] Downloading voice model ...", + percentages[0], + progress_bar, + ) + urllib.request.urlretrieve(url, zip_name) # noqa: S310 + + display_progress("[~] Extracting zip file...", percentages[1], progress_bar) + _extract_model(zip_name, extraction_path, remove_zip=True) + + +def upload_model( + files: Sequence[StrPath], + name: str, + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Upload a voice model from either a zip file or a .pth file and an + optional index file. + + Parameters + ---------- + files : Sequence[StrPath] + Paths to the files to upload. + name : str + The name to give to the uploaded voice model. + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Raises + ------ + NotProvidedError + If no file paths or name are provided. + VoiceModelExistsError + If a voice model with the provided name already + exists. + UploadFormatError + If a single uploaded file is not a .pth file or a .zip file. + If two uploaded files are not a .pth file and an .index file. + UploadLimitError + If more than two file paths are provided. + + """ + if not files: + raise NotProvidedError(entity=Entity.FILES, ui_msg=UIMessage.NO_UPLOADED_FILES) + if not name: + raise NotProvidedError(entity=Entity.MODEL_NAME) + model_dir_path = RVC_MODELS_DIR / name + if model_dir_path.exists(): + raise VoiceModelExistsError(name) + sorted_file_paths = sorted([Path(f) for f in files], key=lambda f: f.suffix) + match sorted_file_paths: + case [file_path]: + if file_path.suffix == ".pth": + display_progress("[~] Copying .pth file ...", percentage, progress_bar) + copy_files_to_new_dir([file_path], model_dir_path) + # NOTE a .pth file is actually itself a zip file + elif zipfile.is_zipfile(file_path): + display_progress("[~] Extracting zip file...", percentage, progress_bar) + _extract_model(file_path, model_dir_path) + else: + raise UploadFormatError( + entity=Entity.FILES, + formats=[".pth", ".zip"], + multiple=False, + ) + case [index_path, pth_path]: + if index_path.suffix == ".index" and pth_path.suffix == ".pth": + display_progress( + "[~] Copying .pth file and index file ...", + percentage, + progress_bar, + ) + copy_files_to_new_dir([index_path, pth_path], model_dir_path) + else: + raise UploadFormatError( + entity=Entity.FILES, + formats=[".pth", ".index"], + multiple=True, + ) + case _: + raise UploadLimitError(entity=Entity.FILES, limit="two") + + +def delete_models( + names: Sequence[str], + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete one or more voice models. + + Parameters + ---------- + names : Sequence[str] + Names of the voice models to delete. 
+ progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + Raises + ------ + NotProvidedError + If no names are provided. + VoiceModelNotFoundError + If a voice model with a provided name does not exist. + + """ + if not names: + raise NotProvidedError( + entity=Entity.MODEL_NAMES, + ui_msg=UIMessage.NO_VOICE_MODELS, + ) + display_progress( + "[~] Deleting voice models ...", + percentage, + progress_bar, + ) + for name in names: + model_dir_path = RVC_MODELS_DIR / name + if not model_dir_path.is_dir(): + raise VoiceModelNotFoundError(name) + shutil.rmtree(model_dir_path) + + +def delete_all_models( + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + Delete all voice models. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Gradio progress bar to update. + percentage : float, default=0.5 + Percentage to display in the progress bar. + + """ + all_model_names = get_saved_model_names() + display_progress("[~] Deleting all voice models ...", percentage, progress_bar) + for model_name in all_model_names: + model_dir_path = RVC_MODELS_DIR / model_name + if model_dir_path.is_dir(): + shutil.rmtree(model_dir_path) diff --git a/src/ultimate_rvc/core/manage/other_settings.py b/src/ultimate_rvc/core/manage/other_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..a63338ca0caead397c321130086c3b3aa0e8712b --- /dev/null +++ b/src/ultimate_rvc/core/manage/other_settings.py @@ -0,0 +1,29 @@ +"""Module which defines functions used for managing various settings.""" + +import shutil + +import gradio as gr + +from ultimate_rvc.common import TEMP_DIR +from ultimate_rvc.core.common import display_progress + + +def delete_temp_files( + progress_bar: gr.Progress | None = None, + percentage: float = 0.5, +) -> None: + """ + + Delete all temporary files. + + Parameters + ---------- + progress_bar : gr.Progress, optional + Progress bar to update. + percentage : float, optional + The percentage to display in the progress bar. 
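+
+ Examples
+ --------
+ Illustrative usage; this removes the entire temporary directory if
+ it exists.
+
+ >>> delete_temp_files()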
+ + """ + display_progress("[~] Deleting all temporary files...", percentage, progress_bar) + if TEMP_DIR.is_dir(): + shutil.rmtree(TEMP_DIR) diff --git a/src/ultimate_rvc/core/manage/public_models.json b/src/ultimate_rvc/core/manage/public_models.json new file mode 100644 index 0000000000000000000000000000000000000000..22485c2489a976492f3fc5f39f3b0f342ffcd0d7 --- /dev/null +++ b/src/ultimate_rvc/core/manage/public_models.json @@ -0,0 +1,646 @@ +{ + "tags": [ + { + "name": "English", + "description": "Character speaks English" + }, + { + "name": "Japanese", + "description": "Character speaks Japanese" + }, + { + "name": "Other Language", + "description": "The character speaks Other Language" + }, + { + "name": "Anime", + "description": "Character from anime" + }, + { + "name": "Vtuber", + "description": "Character is a vtuber" + }, + { + "name": "Real person", + "description": "A person who exists in the real world" + }, + { + "name": "Game character", + "description": "A character from the game" + } + ], + "models": [ + { + "name": "Emilia", + "url": "https://huggingface.co/RinkaEmina/RVC_Sharing/resolve/main/Emilia%20V2%2048000.zip", + "description": "Emilia from Re:Zero", + "added": "2023-07-31", + "credit": "rinka4759", + "tags": [ + "Anime" + ] + }, + { + "name": "Klee", + "url": "https://huggingface.co/qweshkka/Klee/resolve/main/Klee.zip", + "description": "Klee from Genshin Impact", + "added": "2023-07-31", + "credit": "qweshsmashjuicefruity", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Yelan", + "url": "https://huggingface.co/iroaK/RVC2_Yelan_GenshinImpact/resolve/main/YelanJP.zip", + "description": "Yelan from Genshin Impact", + "added": "2023-07-31", + "credit": "iroak", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Yae Miko", + "url": "https://huggingface.co/iroaK/RVC2_YaeMiko_GenshinImpact/resolve/main/Yae_MikoJP.zip", + "description": "Yae Miko from Genshin Impact", + "added": "2023-07-31", + "credit": "iroak", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Lisa", + "url": "https://huggingface.co/qweshkka/Lisa2ver/resolve/main/Lisa.zip", + "description": "Lisa from Genshin Impact", + "added": "2023-07-31", + "credit": "qweshsmashjuicefruity", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Kazuha", + "url": "https://huggingface.co/iroaK/RVC2_Kazuha_GenshinImpact/resolve/main/Kazuha.zip", + "description": "Kaedehara Kazuha from Genshin Impact", + "added": "2023-07-31", + "credit": "iroak", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Barbara", + "url": "https://huggingface.co/iroaK/RVC2_Barbara_GenshinImpact/resolve/main/BarbaraJP.zip", + "description": "Barbara from Genshin Impact", + "added": "2023-07-31", + "credit": "iroak", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Tom Holland", + "url": "https://huggingface.co/TJKAI/TomHolland/resolve/main/TomHolland.zip", + "description": "Tom Holland (Spider-Man)", + "added": "2023-08-03", + "credit": "tjkcreative", + "tags": [ + "Real person", + "English" + ] + }, + { + "name": "Kamisato Ayaka", + "url": "https://huggingface.co/benitheworld/ayaka-cn/resolve/main/ayaka-cn.zip", + "description": "Kamisato Ayaka from Genshin Impact - CN voice actor", + "added": "2023-08-03", + "credit": "kannysoap", + "tags": [ + "Game character", + "Other Language" + ] + }, + { + "name": "Amai Odayaka", + "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Amai-Odayaka.zip", + 
"description": "Amai Odayaka from Yandere Simulator", + "added": "2023-08-03", + "credit": "minecraftian47", + "tags": [ + "Anime", + "English" + ] + }, + { + "name": "Compa - Hyperdimension Neptunia", + "url": "https://huggingface.co/zeerowiibu/WiibuRVCCollection/resolve/main/Compa%20(Choujigen%20Game%20Neptunia)%20(JPN)%20(RVC%20v2)%20(150%20Epochs).zip", + "description": "Compa from Choujigen Game Neptune (aka Hyperdimension Neptunia)", + "added": "2023-08-03", + "credit": "zeerowiibu", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Fu Xuan", + "url": "https://huggingface.co/Juneuarie/FuXuan/resolve/main/FuXuan.zip", + "description": "Fu Xuan from Honkai Star Rail (HSR)", + "added": "2023-08-03", + "credit": "__june", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Xinyan", + "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/XinyanRVC.zip", + "description": "Xinyan from Genshin Impact", + "added": "2023-08-03", + "credit": "shyelijah", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Enterprise", + "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Enterprise-JP.zip", + "description": "Enterprise from Azur Lane", + "added": "2023-08-03", + "credit": "minecraftian47", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Kurt Cobain", + "url": "https://huggingface.co/Florstie/Kurt_Cobain_byFlorst/resolve/main/Kurt_Florst.zip", + "description": "singer Kurt Cobain", + "added": "2023-08-03", + "credit": "florst", + "tags": [ + "Real person", + "English" + ] + }, + { + "name": "Ironmouse", + "url": "https://huggingface.co/Tempo-Hawk/IronmouseV2/resolve/main/IronmouseV2.zip", + "description": "Ironmouse", + "added": "2023-08-03", + "credit": "ladyimpa", + "tags": [ + "Vtuber", + "English" + ] + }, + { + "name": "Bratishkinoff", + "url": "https://huggingface.co/JHmashups/Bratishkinoff/resolve/main/bratishkin.zip", + "description": "Bratishkinoff (Bratishkin | Братишкин) - russian steamer ", + "added": "2023-08-03", + "credit": ".caddii", + "tags": [ + "Real person", + "Other Language" + ] + }, + { + "name": "Yagami Light", + "url": "https://huggingface.co/geekdom-tr/Yagami-Light/resolve/main/Yagami-Light.zip", + "description": "Yagami Light (Miyano Mamoru) from death note", + "added": "2023-08-03", + "credit": "takka / takka#7700", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Itashi", + "url": "https://huggingface.co/4uGGun/4uGGunRVC/resolve/main/itashi.zip", + "description": "Itashi (Russian fandubber AniLibria) ", + "added": "2023-08-03", + "credit": "BelochkaOff", + "tags": [ + "Anime", + "Other Language", + "Real person" + ] + }, + { + "name": "Michiru Kagemori", + "url": "https://huggingface.co/WolfMK/MichiruKagemori/resolve/main/MichiruKagemori_RVC_V2.zip", + "description": "Michiru Kagemori from Brand New Animal (300 Epochs)", + "added": "2023-08-03", + "credit": "wolfmk", + "tags": [ + "Anime", + "English" + ] + }, + { + "name": "Kaeya", + "url": "https://huggingface.co/nlordqting4444/nlordqtingRVC/resolve/main/Kaeya.zip", + "description": "Kaeya (VA: Kohsuke Toriumi) from Genshin Impact (300 Epochs)", + "added": "2023-08-03", + "credit": "nlordqting4444", + "tags": [ + "Game character", + "Japanese" + ] + }, + { + "name": "Mona Megistus", + "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/MonaRVC.zip", + "description": "Mona Megistus (VA: Felecia Angelle) from Genshin Impact (250 Epochs)", + "added": "2023-08-03", + "credit": 
"shyelijah", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Klee", + "url": "https://huggingface.co/hardbop/AI_MODEL_THINGY/resolve/main/kleeeng_rvc.zip", + "description": "Klee from Genshin Impact (400 Epochs)", + "added": "2023-08-03", + "credit": "hardbop", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Sakurakoji Kinako", + "url": "https://huggingface.co/Gorodogi/RVC2MangioCrepe/resolve/main/kinakobetatwo700.zip", + "description": "Sakurakoji Kinako (Suzuhara Nozomi) from Love Live! Superstar!! (700 Epoch)", + "added": "2023-08-03", + "credit": "ck1089", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Minamo Kurosawa", + "url": "https://huggingface.co/timothy10583/RVC/resolve/main/minamo-kurosawa.zip", + "description": "Minamo (Nyamo) Kurosawa (Azumanga Daioh US DUB) (300 Epochs)", + "added": "2023-08-03", + "credit": "timothy10583", + "tags": [ + "Anime" + ] + }, + { + "name": "Neco Arc", + "url": "https://huggingface.co/Ozzy-Helix/Neko_Arc_Neko_Aruku.RVCv2/resolve/main/Neko_Arc-V3-E600.zip", + "description": "Neco Arc (Neco-Aruku) (Epochs 600)", + "added": "2023-08-03", + "credit": "ozzy_helix_", + "tags": [ + "Anime" + ] + }, + { + "name": "Makima", + "url": "https://huggingface.co/andolei/makimaen/resolve/main/makima-en-dub.zip", + "description": "Makima from Chainsaw Man (300 Epochs)", + "added": "2023-08-03", + "credit": "andpproximately", + "tags": [ + "Anime", + "English" + ] + }, + { + "name": "PomPom", + "url": "https://huggingface.co/benitheworld/pom-pom/resolve/main/pom-pom.zip", + "description": "PomPom from Honkai Star Rail (HSR) (200 Epochs)", + "added": "2023-08-03", + "credit": "kannysoap", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Asuka Langley Soryu", + "url": "https://huggingface.co/Piegirl/asukaadv/resolve/main/asuka.zip", + "description": "Asuka Langley Soryu/Tiffany Grant from Neon Genesis Evangelion (400 Epochs)", + "added": "2023-08-03", + "credit": "piegirl", + "tags": [ + "Anime", + "English" + ] + }, + { + "name": "Ochaco Uraraka", + "url": "https://huggingface.co/legitdark/JP-Uraraka-By-Dan/resolve/main/JP-Uraraka-By-Dan.zip", + "description": "Ochaco Uraraka from Boku no Hero Academia (320 Epochs)", + "added": "2023-08-03", + "credit": "danthevegetable", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Sunaokami Shiroko", + "url": "https://huggingface.co/LordDavis778/BlueArchivevoicemodels/resolve/main/SunaokamiShiroko.zip", + "description": "Sunaokami Shiroko from Blue Archive (500 Epochs)", + "added": "2023-08-03", + "credit": "lorddavis778", + "tags": [ + "Anime" + ] + }, + { + "name": "Dainsleif", + "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Dainsleif/Dainsleif.zip", + "description": "Dainsleif from Genshin Impact (335 Epochs)", + "added": "2023-08-03", + "credit": "nasley", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Mae Asmr", + "url": "https://huggingface.co/ctian/VRC/resolve/main/MaeASMR.zip", + "description": "Mae Asmr - harvest mommy voice (YOUTUBE) (300 Epochs)", + "added": "2023-08-03", + "credit": "ctian_04", + "tags": [ + "English", + "Real person", + "Vtuber" + ] + }, + { + "name": "Hana Shirosaki ", + "url": "https://huggingface.co/Pawlik17/HanaWataten/resolve/main/HanaWATATEN.zip", + "description": "Hana Shirosaki / 白 咲 花 From Watashi ni Tenshi ga Maiorita! 
(570 Epochs)", + "added": "2023-08-03", + "credit": "tamalik", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Kaguya Shinomiya ", + "url": "https://huggingface.co/1ski/1skiRVCModels/resolve/main/kaguyav5.zip", + "description": "Kaguya Shinomiya from Kaguya-Sama Love is war (200 Epochs)", + "added": "2023-08-03", + "credit": "1ski", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Nai Shiro", + "url": "https://huggingface.co/kuushiro/Shiro-RVC-No-Game-No-Life/resolve/main/shiro-jp-360-epochs.zip", + "description": "Nai Shiro (Ai Kayano) from No Game No Life (360 Epochs)", + "added": "2023-08-03", + "credit": "kxouyou", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Yuigahama Yui", + "url": "https://huggingface.co/Zerokano/Yuigahama_Yui-RVCv2/resolve/main/Yuigahama_Yui.zip", + "description": "Yuigahama Yui from Yahari Ore no Seishun Love Comedy wa Machigatteiru (250 Epochs)", + "added": "2023-08-03", + "credit": "zerokano", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Fuwawa Abyssgard", + "url": "https://huggingface.co/megaaziib/my-rvc-models-collection/resolve/main/fuwawa.zip", + "description": "Fuwawa Abyssgard (FUWAMOCO) from Hololive gen 3 (250 Epochs)", + "added": "2023-08-03", + "credit": "megaaziib", + "tags": [ + "Vtuber", + "English" + ] + }, + { + "name": "Kana Arima", + "url": "https://huggingface.co/ddoumakunn/arimakanna/resolve/main/arimakanna.zip", + "description": "Kana Arima from Oshi no Ko (250 Epochs)", + "added": "2023-08-03", + "credit": "ddoumakunn", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Raiden Shogun", + "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/RaidenShogun/RaidenShogun.zip", + "description": "Raiden Shogun from Genshin Impact (310 Epochs)", + "added": "2023-08-03", + "credit": "nasley", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Alhaitham", + "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Alhaitham/Alhaitham.zip", + "description": "Alhaitham from Genshin Impact (320 Epochs)", + "added": "2023-08-03", + "credit": "nasley", + "tags": [ + "Game character", + "English" + ] + }, + { + "name": "Izuku Midoriya", + "url": "https://huggingface.co/BigGuy635/MHA/resolve/main/DekuJP.zip", + "description": "Izuku Midoriya from Boku no Hero Academia (100 Epochs)", + "added": "2023-08-03", + "credit": "khjjnoffical", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Kurumi Shiratori", + "url": "https://huggingface.co/HarunaKasuga/YoshikoTsushima/resolve/main/KurumiShiratori.zip", + "description": "Kurumi Shiratori (VA: Ruka Fukagawa) from D4DJ (500 Epochs)", + "added": "2023-08-03", + "credit": "seakrait", + "tags": [ + "Anime", + "Japanese" + ] + }, + { + "name": "Veibae", + "url": "https://huggingface.co/datasets/Papaquans/Veibae/resolve/main/veibae_e165_s125565.zip", + "description": "Veibae (165 Epochs)", + "added": "2023-08-03", + "credit": "recairo", + "tags": [ + "Vtuber", + "English" + ] + }, + { + "name": "Black Panther", + "url": "https://huggingface.co/TJKAI/BlackPannther/resolve/main/BlackPanther.zip", + "description": "Black Panther (Chadwick Boseman) (300 Epochs)", + "added": "2023-08-03", + "credit": "tjkcreative", + "tags": [ + "Real person", + "English" + ] + }, + { + "name": "Gawr Gura", + "url": "https://pixeldrain.com/u/3tJmABXA", + "description": "Gawr Gura from Hololive EN", + "added": "2023-08-05", + "credit": "dacoolkid44 & hijack", + "tags": [ + "Vtuber" + ] + }, + { + "name": "Houshou Marine", + "url": 
"https://pixeldrain.com/u/L1YLfZyU", + "description": "Houshou Marine from Hololive JP", + "added": "2023-08-05", + "credit": "dacoolkid44 & hijack", + "tags": [ + "Vtuber", + "Japanese" + ] + }, + { + "name": "Hoshimachi Suisei", + "url": "https://pixeldrain.com/u/YP89C21u", + "description": "Hoshimachi Suisei from Hololive JP", + "added": "2023-08-05", + "credit": "dacoolkid44 & hijack & Maki Ligon", + "tags": [ + "Vtuber", + "Japanese" + ] + }, + { + "name": "Laplus Darkness", + "url": "https://pixeldrain.com/u/zmuxv5Bf", + "description": "Laplus Darkness from Hololive JP", + "added": "2023-08-05", + "credit": "dacoolkid44 & hijack", + "tags": [ + "Vtuber", + "Japanese" + ] + }, + { + "name": "AZKi", + "url": "https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip", + "description": "AZKi from Hololive JP", + "added": "2023-08-05", + "credit": "Kit Lemonfoot / NSHFB", + "tags": [ + "Vtuber", + "Japanese" + ] + }, + { + "name": "Ado", + "url": "https://huggingface.co/pjesek/AdoRVCv2/resolve/main/AdoRVCv2.zip", + "description": "Talented JP artist (500 epochs using every song from her first album)", + "added": "2023-08-05", + "credit": "pjesek", + "tags": [ + "Real person", + "Japanese" + ] + }, + { + "name": "LiSA", + "url": "https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip", + "description": "Talented JP artist (400 epochs)", + "added": "2023-08-05", + "credit": "Phant0m", + "tags": [ + "Real person", + "Japanese" + ] + }, + { + "name": "Kokomi", + "url": "https://huggingface.co/benitheworld/kokomi-kr/resolve/main/kokomi-kr.zip", + "description": "Kokomi from Genshin Impact KR (300 Epochs)", + "added": "2023-08-09", + "credit": "kannysoap", + "tags": [ + "Game character", + "Other Language" + ] + }, + { + "name": "Ivanzolo", + "url": "https://huggingface.co/fenikkusugosuto/IvanZolo2004/resolve/main/ivanZolo.zip", + "description": "Ivanzolo2004 russian streamer | Иван Золо 2004", + "added": "2023-08-09", + "credit": "prezervativ_naruto2009", + "tags": [ + "Other Language", + "Real person" + ] + }, + { + "name": "Nilou", + "url": "https://huggingface.co/benitheworld/nilou-kr/resolve/main/nilou-kr.zip", + "description": "Nilou from Genshin Impact KR (300 Epochs)", + "added": "2023-08-09", + "credit": "kannysoap", + "tags": [ + "Game character", + "Other Language" + ] + }, + { + "name": "Dr. Doofenshmirtz", + "url": "https://huggingface.co/Argax/doofenshmirtz-RUS/resolve/main/doofenshmirtz.zip", + "description": "RUS Dr. Doofenshmirtz from Phineas and Ferb (300 epochs)", + "added": "2023-08-09", + "credit": "argaxus", + "tags": [ + "Other Language" + ] + } + ] +} \ No newline at end of file diff --git a/src/ultimate_rvc/core/typing_extra.py b/src/ultimate_rvc/core/typing_extra.py new file mode 100644 index 0000000000000000000000000000000000000000..00ad8b9ba3472e751d9acf678522684aa154e63b --- /dev/null +++ b/src/ultimate_rvc/core/typing_extra.py @@ -0,0 +1,294 @@ +""" +Module which defines extra types for the core of the Ultimate RVC +project. 
+""" + +from collections.abc import Callable +from enum import StrEnum, auto + +from pydantic import BaseModel, ConfigDict + +from ultimate_rvc.typing_extra import AudioExt, F0Method + +# Voice model management + + +class ModelTagName(StrEnum): + """Names of valid voice model tags.""" + + ENGLISH = "English" + JAPANESE = "Japanese" + OTHER_LANGUAGE = "Other Language" + ANIME = "Anime" + VTUBER = "Vtuber" + REAL_PERSON = "Real person" + GAME_CHARACTER = "Game character" + + +class ModelTagMetaData(BaseModel): + """ + Metadata for a voice model tag. + + Attributes + ---------- + name : ModelTagName + The name of the tag. + description : str + The description of the tag. + + """ + + name: ModelTagName + description: str + + +class ModelMetaData(BaseModel): + """ + Metadata for a voice model. + + Attributes + ---------- + name : str + The name of the voice model. + description : str + A description of the voice model. + tags : list[ModelTagName] + The tags associated with the voice model. + credit : str + Who created the voice model. + added : str + The date the voice model was created. + url : str + An URL pointing to a location where the voice model can be + downloaded. + + """ + + name: str + description: str + tags: list[ModelTagName] + credit: str + added: str + url: str + + +class ModelMetaDataTable(BaseModel): + """ + Table with metadata for a set of voice models. + + Attributes + ---------- + tags : list[ModelTagMetaData] + Metadata for the tags associated with the given set of voice + models. + models : list[ModelMetaData] + Metadata for the given set of voice models. + + """ + + tags: list[ModelTagMetaData] + models: list[ModelMetaData] + + +ModelMetaDataPredicate = Callable[[ModelMetaData], bool] + +ModelMetaDataList = list[list[str | list[ModelTagName]]] + + +# Song cover generation + + +class SourceType(StrEnum): + """The type of source providing the song to generate a cover of.""" + + URL = auto() + FILE = auto() + SONG_DIR = auto() + + +class AudioExtInternal(StrEnum): + """Audio file formats for internal use.""" + + MP3 = "mp3" + WAV = "wav" + FLAC = "flac" + OGG = "ogg" + IPOD = "ipod" + ADTS = "adts" + + +class FileMetaData(BaseModel): + """ + Metadata for a file. + + Attributes + ---------- + name : str + The name of the file. + hash_id : str + The hash ID of the file. + + """ + + name: str + hash_id: str + + +class WaveifiedAudioMetaData(BaseModel): + """ + Metadata for a waveified audio track. + + Attributes + ---------- + audio_track : FileMetaData + Metadata for the audio track that was waveified. + + """ + + audio_track: FileMetaData + + +class SeparatedAudioMetaData(BaseModel): + """ + Metadata for a separated audio track. + + Attributes + ---------- + audio_track : FileMetaData + Metadata for the audio track that was separated. + model_name : str + The name of the model used for separation. + segment_size : int + The segment size used for separation. + + """ + + audio_track: FileMetaData + model_name: str + segment_size: int + + model_config = ConfigDict(protected_namespaces=()) + + +class ConvertedVocalsMetaData(BaseModel): + """ + Metadata for an RVC converted vocals track. + + Attributes + ---------- + vocals_track : FileMetaData + Metadata for the vocals track that was converted. + model_name : str + The name of the model used for vocal conversion. + n_semitones : int + The number of semitones the converted vocals were pitch-shifted + by. + f0_method : F0Method + The method used for pitch detection. 
+ index_rate : float + The influence of the index file on the vocal conversion. + filter_radius : int + The filter radius used for the vocal conversion. + rms_mix_rate : float + The blending of the volume envelope of the converted vocals. + protect : float + The protection rate used for consonants and breathing sounds. + hop_length : int + The hop length used for crepe-based pitch detection. + + """ + + vocals_track: FileMetaData + model_name: str + n_semitones: int + f0_method: F0Method + index_rate: float + filter_radius: int + rms_mix_rate: float + protect: float + hop_length: int + + model_config = ConfigDict(protected_namespaces=()) + + +class EffectedVocalsMetaData(BaseModel): + """ + Metadata for an effected vocals track. + + Attributes + ---------- + vocals_track : FileMetaData + Metadata for the vocals track that effects were applied to. + room_size : float + The room size of the reverb effect applied to the vocals track. + wet_level : float + The wetness level of the reverb effect applied to the vocals + track. + dry_level : float + The dryness level of the reverb effect. applied to the vocals + track. + damping : float + The damping of the reverb effect applied to the vocals track. + + """ + + vocals_track: FileMetaData + room_size: float + wet_level: float + dry_level: float + damping: float + + +class PitchShiftMetaData(BaseModel): + """ + Metadata for a pitch-shifted audio track. + + Attributes + ---------- + audio_track : FileMetaData + Metadata for the audio track that was pitch-shifted. + n_semitones : int + The number of semitones the audio track was pitch-shifted by. + + """ + + audio_track: FileMetaData + n_semitones: int + + +class StagedAudioMetaData(BaseModel): + """ + Metadata for a staged audio track. + + Attributes + ---------- + audio_track : FileMetaData + Metadata for the audio track that was staged. + gain : float + The gain applied to the audio track. + + """ + + audio_track: FileMetaData + gain: float + + +class MixedSongMetaData(BaseModel): + """ + Metadata for a mixed song. + + Attributes + ---------- + staged_audio_tracks : list[StagedAudioMetaData] + Metadata for the staged audio tracks that were mixed. + + output_sr : int + The sample rate of the mixed song. + output_format : AudioExt + The audio file format of the mixed song. 
+ + """ + + staged_audio_tracks: list[StagedAudioMetaData] + output_sr: int + output_format: AudioExt diff --git a/src/ultimate_rvc/py.typed b/src/ultimate_rvc/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/ultimate_rvc/stubs/audio_separator/separator/__init__.pyi b/src/ultimate_rvc/stubs/audio_separator/separator/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6c2a4ef9884427d5e2c12f35f9e0f02eb96eb838 --- /dev/null +++ b/src/ultimate_rvc/stubs/audio_separator/separator/__init__.pyi @@ -0,0 +1,100 @@ +from typing import TypedDict + +import logging + +from ultimate_rvc.typing_extra import StrPath + +class MDXParams(TypedDict): + hop_length: int + segment_size: int + overlap: float + batch_size: int + enable_denoise: bool + +class VRParams(TypedDict): + batch_size: int + window_size: int + aggression: int + enable_tta: bool + enable_post_process: bool + post_process_threshold: float + high_end_process: bool + +class DemucsParams(TypedDict): + segment_size: str + shifts: int + overlap: float + segments_enabled: bool + +class MDXCParams(TypedDict): + segment_size: int + override_model_segment_size: bool + batch_size: int + overlap: int + pitch_shift: int + +class ArchSpecificParams(TypedDict): + MDX: MDXParams + VR: VRParams + Demucs: DemucsParams + MDXC: MDXCParams + +class Separator: + arch_specific_params: ArchSpecificParams + def __init__( + self, + log_level: int = ..., + log_formatter: logging.Formatter | None = None, + model_file_dir: StrPath = "/tmp/audio-separator-models/", # noqa: S108 + output_dir: StrPath | None = None, + output_format: str = "WAV", + output_bitrate: str | None = None, + normalization_threshold: float = 0.9, + amplification_threshold: float = 0.6, + output_single_stem: str | None = None, + invert_using_spec: bool = False, + sample_rate: int = 44100, + mdx_params: MDXParams = { + "hop_length": 1024, + "segment_size": 256, + "overlap": 0.25, + "batch_size": 1, + "enable_denoise": False, + }, + vr_params: VRParams = { + "batch_size": 1, + "window_size": 512, + "aggression": 5, + "enable_tta": False, + "enable_post_process": False, + "post_process_threshold": 0.2, + "high_end_process": False, + }, + demucs_params: DemucsParams = { + "segment_size": "Default", + "shifts": 2, + "overlap": 0.25, + "segments_enabled": True, + }, + mdxc_params: MDXCParams = { + "segment_size": 256, + "override_model_segment_size": False, + "batch_size": 1, + "overlap": 8, + "pitch_shift": 0, + }, + ) -> None: ... + def download_model_files( + self, + model_filename: str, + ) -> tuple[str, str, str, str, str | None]: ... + def load_model( + self, + model_filename: str = "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt", + ) -> None: ... + def separate( + self, + audio_file_path: str, + primary_output_name: str | None = None, + secondary_output_name: str | None = None, + ) -> list[str]: ... 
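For orientation, a minimal usage sketch (not part of the committed diff) of the Separator interface stubbed above, assuming the real audio-separator package: the model filename matches SeparationModel in ultimate_rvc.typing_extra further down, while the directory and file paths are placeholders.

from audio_separator.separator import Separator

# Sketch only: drive the separation API using the signatures declared in the stub.
separator = Separator(output_dir="audio/stems", output_format="WAV")
separator.load_model(model_filename="UVR-MDX-NET-Voc_FT.onnx")
stem_paths = separator.separate("audio/song.wav")  # list of paths to the written stems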
diff --git a/src/ultimate_rvc/stubs/gradio/__init__.pyi b/src/ultimate_rvc/stubs/gradio/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..bb3c95ac2325f4e875004061bfc3a5a280cf13bb --- /dev/null +++ b/src/ultimate_rvc/stubs/gradio/__init__.pyi @@ -0,0 +1,245 @@ +import json + +from gradio import ( + _simple_templates, + components, + image_utils, + layouts, + processing_utils, + templates, + themes, +) +from gradio.blocks import Blocks +from gradio.chat_interface import ChatInterface +from gradio.components import ( + HTML, + JSON, + AnnotatedImage, + Annotatedimage, + Audio, + BarPlot, + BrowserState, + Button, + Chatbot, + ChatMessage, + Checkbox, + CheckboxGroup, + Checkboxgroup, + ClearButton, + Code, + ColorPicker, + DataFrame, + Dataframe, + Dataset, + DateTime, + DownloadButton, + Dropdown, + DuplicateButton, + File, + FileExplorer, + Gallery, + Highlight, + HighlightedText, + Highlightedtext, + Image, + ImageEditor, + Json, + Label, + LinePlot, + LoginButton, + Markdown, + MessageDict, + Model3D, + MultimodalTextbox, + Number, + ParamViewer, + Plot, + Radio, + ScatterPlot, + Slider, + State, + Text, + Textbox, + Timer, + UploadButton, + Video, + component, +) +from gradio.components.audio import WaveformOptions +from gradio.components.image_editor import Brush, Eraser +from gradio.data_classes import FileData +from gradio.events import ( + DeletedFileData, + DownloadData, + EventData, + KeyUpData, + LikeData, + RetryData, + SelectData, + UndoData, + on, +) +from gradio.exceptions import Error +from gradio.external import load +from gradio.flagging import ( + CSVLogger, + FlaggingCallback, + SimpleCSVLogger, +) +from gradio.helpers import ( + Info, + Progress, + Warning, + skip, + update, +) +from gradio.helpers import create_examples as Examples # noqa: N812 +from gradio.interface import Interface, TabbedInterface, close_all +from gradio.layouts import Accordion, Column, Group, Row, Tab, TabItem, Tabs +from gradio.oauth import OAuthProfile, OAuthToken +from gradio.renderable import render +from gradio.routes import Request, mount_gradio_app +from gradio.templates import ( + Files, + ImageMask, + List, + Matrix, + Mic, + Microphone, + Numpy, + Paint, + PlayableVideo, + Sketchpad, + TextArea, +) +from gradio.themes import Base as Theme +from gradio.utils import NO_RELOAD, FileSize, get_package_version, set_static_paths +from gradio.wasm_utils import IS_WASM + +if not IS_WASM: # noqa: PYI002 + from gradio.cli import deploy + from gradio.ipython_ext import load_ipython_extension + +__version__ = ... 
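+# The re-export list below mirrors gradio's public namespace so that
+# `import gradio as gr` stays fully typed under strict type checking.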
+__all__ = [ + "HTML", + "IS_WASM", + "JSON", + "NO_RELOAD", + "Accordion", + "AnnotatedImage", + "Annotatedimage", + "Audio", + "BarPlot", + "Blocks", + "BrowserState", + "Brush", + "Button", + "CSVLogger", + "ChatInterface", + "ChatMessage", + "Chatbot", + "Checkbox", + "CheckboxGroup", + "Checkboxgroup", + "ClearButton", + "Code", + "ColorPicker", + "Column", + "DataFrame", + "Dataframe", + "Dataset", + "DateTime", + "DeletedFileData", + "DownloadButton", + "DownloadData", + "Dropdown", + "DuplicateButton", + "Eraser", + "Error", + "EventData", + "Examples", + "File", + "FileData", + "FileExplorer", + "FileSize", + "Files", + "FlaggingCallback", + "Gallery", + "Group", + "Highlight", + "HighlightedText", + "Highlightedtext", + "Image", + "ImageEditor", + "ImageMask", + "Info", + "Interface", + "Json", + "KeyUpData", + "Label", + "LikeData", + "LinePlot", + "List", + "LoginButton", + "Markdown", + "Matrix", + "MessageDict", + "Mic", + "Microphone", + "Model3D", + "MultimodalTextbox", + "Number", + "Numpy", + "OAuthProfile", + "OAuthToken", + "Paint", + "ParamViewer", + "PlayableVideo", + "Plot", + "Progress", + "Radio", + "Request", + "RetryData", + "Row", + "ScatterPlot", + "SelectData", + "SimpleCSVLogger", + "Sketchpad", + "Slider", + "State", + "Tab", + "TabItem", + "TabbedInterface", + "Tabs", + "Text", + "TextArea", + "Textbox", + "Theme", + "Timer", + "UndoData", + "UploadButton", + "Video", + "Warning", + "WaveformOptions", + "_simple_templates", + "close_all", + "component", + "components", + "deploy", + "get_package_version", + "image_utils", + "json", + "layouts", + "load", + "load_ipython_extension", + "mount_gradio_app", + "on", + "processing_utils", + "render", + "set_static_paths", + "skip", + "templates", + "themes", + "update", +] diff --git a/src/ultimate_rvc/stubs/gradio/events.pyi b/src/ultimate_rvc/stubs/gradio/events.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6095e6297533f8e3b81bd86065baa41e9e7e3fe7 --- /dev/null +++ b/src/ultimate_rvc/stubs/gradio/events.pyi @@ -0,0 +1,344 @@ +from typing import Any, Literal, NotRequired, Protocol, Self, TypedDict + +import dataclasses +from collections import UserString +from collections.abc import Callable, Sequence +from collections.abc import Set as AbstractSet + +from _typeshed import SupportsKeysAndGetItem + +from gradio.blocks import Block, BlockContext, Component +from gradio.components import Timer +from gradio.data_classes import FileData, FileDataDict + +type Dependency = _Dependency[Any, Any, Any] +type EventListenerCallable = _EventListenerCallable[Any, Any, Any] +type EventListener = _EventListener[Any, Any, Any] + +class _EventListenerCallable[T, V, **P](Protocol): + def __call__( + self, + fn: Callable[P, T] | Literal["decorator"] | None = "decorator", + inputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + outputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + api_name: str | Literal[False] | None = None, + scroll_to_output: bool = False, + show_progress: Literal["full", "minimal", "hidden"] = "full", + queue: bool = True, + batch: bool = False, + max_batch_size: int = 4, + preprocess: bool = True, + postprocess: bool = True, + cancels: Dependency | list[Dependency] | None = None, + trigger_mode: Literal["once", "multiple", "always_last"] | None = None, + js: str | None = None, + concurrency_limit: int | 
Literal["default"] | None = "default", + concurrency_id: str | None = None, + show_api: bool = True, + stream_every: float = 0.5, + like_user_message: bool = False, + ) -> _Dependency[T, V, P]: ... + +class _EventListenerCallableFull[T, V, **P](Protocol): + def __call__( + self, + block: Block | None, + fn: Callable[P, T] | Literal["decorator"] | None = "decorator", + inputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + outputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + api_name: str | Literal[False] | None = None, + scroll_to_output: bool = False, + show_progress: Literal["full", "minimal", "hidden"] = "full", + queue: bool = True, + batch: bool = False, + max_batch_size: int = 4, + preprocess: bool = True, + postprocess: bool = True, + cancels: Dependency | list[Dependency] | None = None, + trigger_mode: Literal["once", "multiple", "always_last"] | None = None, + js: str | None = None, + concurrency_limit: int | Literal["default"] | None = "default", + concurrency_id: str | None = None, + show_api: bool = True, + time_limit: int | None = None, + stream_every: float = 0.5, + like_user_message: bool = False, + ) -> _Dependency[T, V, P]: ... + +def set_cancel_events( + triggers: Sequence[EventListenerMethod], + cancels: Dependency | list[Dependency] | None, +) -> None: ... + +class _Dependency[T, V, **P](dict[str, V]): + fn: Callable[P, T] + associated_timer: Timer | None + then: EventListenerCallable + success: EventListenerCallable + + def __init__( + self, + trigger: Block | None, + key_vals: SupportsKeysAndGetItem[str, V], + dep_index: int | None, + fn: Callable[P, T], + associated_timer: Timer | None = None, + ) -> None: ... + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T: ... + +class EventData[T]: + target: Block | None + _data: T + + def __init__(self, target: Block | None, _data: T) -> None: ... + +class _SelectData(TypedDict): + index: int | tuple[int, int] + value: Any + row_value: NotRequired[list[Any]] + col_value: NotRequired[list[Any]] + selected: NotRequired[bool] + +class SelectData(EventData[_SelectData]): + index: int | tuple[int, int] + value: Any + row_value: list[Any] | None + col_value: list[Any] | None + selected: bool + + def __init__(self, target: Block | None, data: _SelectData) -> None: ... + +class _KeyUpData(TypedDict): + key: str + input_value: str + +class KeyUpData(EventData[_KeyUpData]): + key: str + input_value: str + + def __init__(self, target: Block | None, data: _KeyUpData) -> None: ... + +class DeletedFileData(EventData[FileDataDict]): + file: FileData + + def __init__(self, target: Block | None, data: FileDataDict) -> None: ... + +class _LikeData(TypedDict): + index: int | tuple[int, int] + value: Any + liked: NotRequired[bool] + +class LikeData(EventData[_LikeData]): + index: int | tuple[int, int] + value: Any + liked: bool + + def __init__(self, target: Block | None, data: _LikeData) -> None: ... + +class _RetryData(TypedDict): + index: int | tuple[int, int] + value: Any + +class RetryData(EventData[_RetryData]): + index: int | tuple[int, int] + value: Any + + def __init__(self, target: Block | None, data: _RetryData) -> None: ... 
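+# _UndoData/UndoData below follow the same pattern as the payload classes
+# above: a TypedDict describing the raw event payload, wrapped in an EventData
+# subclass that exposes the same fields as typed attributes.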
+ +class _UndoData(TypedDict): + index: int | tuple[int, int] + value: Any + +class UndoData(EventData[_UndoData]): + index: int | tuple[int, int] + value: Any + + def __init__(self, target: Block | None, data: _UndoData) -> None: ... + +class DownloadData(EventData[FileDataDict]): + file: FileData + + def __init__(self, target: Block | None, data: FileDataDict) -> None: ... + +@dataclasses.dataclass +class EventListenerMethod: + block: Block | None + event_name: str + +class _EventListener[T, V, **P](UserString): + __slots__ = ( + "callback", + "config_data", + "connection", + "doc", + "event_name", + "event_specific_args", + "has_trigger", + "listener", + "show_progress", + "trigger_after", + "trigger_only_on_success", + ) + + event_name: str + has_trigger: bool + config_data: Callable[..., dict[str, T]] + show_progress: Literal["full", "minimal", "hidden"] + callback: Callable[[Block], None] | None + trigger_after: int | None + trigger_only_on_success: bool + doc: str + connection: Literal["sse", "stream"] + event_specific_args: list[dict[str, str]] + listener: _EventListenerCallableFull[T, V, P] + + def __new__( + cls, + event_name: str, + has_trigger: bool = True, + config_data: Callable[..., dict[str, T]] = dict, # noqa: PYI011 + show_progress: Literal["full", "minimal", "hidden"] = "full", + callback: Callable[[Block], None] | None = None, + trigger_after: int | None = None, + trigger_only_on_success: bool = False, + doc: str = "", + connection: Literal["sse", "stream"] = "sse", + event_specific_args: list[dict[str, str]] | None = None, + ) -> Self: ... + def __init__( + self, + event_name: str, + has_trigger: bool = True, + config_data: Callable[..., dict[str, T]] = dict, # noqa: PYI011 + show_progress: Literal["full", "minimal", "hidden"] = "full", + callback: Callable[[Block], None] | None = None, + trigger_after: int | None = None, + trigger_only_on_success: bool = False, + doc: str = "", + connection: Literal["sse", "stream"] = "sse", + event_specific_args: list[dict[str, str]] | None = None, + ) -> None: ... + def set_doc(self, component: str) -> None: ... + def copy(self) -> _EventListener[T, V, P]: ... + @staticmethod + def _setup( + _event_name: str, + _has_trigger: bool, + _show_progress: Literal["full", "minimal", "hidden"], + _callback: Callable[[Block], None] | None, + _trigger_after: int | None, + _trigger_only_on_success: bool, + _event_specific_args: list[dict[str, str]], + _connection: Literal["sse", "stream"] = "sse", + ) -> _EventListenerCallableFull[T, V, P]: ... 
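+
+# Illustrative only (component and function names are made up, not part of the
+# stub): a call such as
+#
+#     btn.click(fn=convert_song, inputs=[song, model], outputs=result).then(
+#         fn=show_message, inputs=None, outputs=message
+#     )
+#
+# resolves to a _Dependency that preserves the callback's parameter and return
+# types, so the chained .then()/.success() calls are checked against the same
+# generics. The module-level `on` below does the same for an arbitrary set of
+# triggers.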
+ +def on[T, **P]( + triggers: Sequence[EventListenerCallable] | EventListenerCallable | None = None, + fn: Callable[P, T] | Literal["decorator"] | None = "decorator", + inputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + outputs: ( + Component + | BlockContext + | Sequence[Component | BlockContext] + | AbstractSet[Component | BlockContext] + | None + ) = None, + *, + api_name: str | Literal[False] | None = None, + scroll_to_output: bool = False, + show_progress: Literal["full", "minimal", "hidden"] = "full", + queue: bool = True, + batch: bool = False, + max_batch_size: int = 4, + preprocess: bool = True, + postprocess: bool = True, + cancels: Dependency | list[Dependency] | None = None, + trigger_mode: Literal["once", "multiple", "always_last"] | None = None, + js: str | None = None, + concurrency_limit: int | Literal["default"] | None = "default", + concurrency_id: str | None = None, + show_api: bool = True, + time_limit: int | None = None, + stream_every: float = 0.5, +) -> _Dependency[T, Any, P]: ... + +class Events: + change: EventListener + input: EventListener + click: EventListener + double_click: EventListener + submit: EventListener + edit: EventListener + clear: EventListener + play: EventListener + pause: EventListener + stop: EventListener + end: EventListener + start_recording: EventListener + pause_recording: EventListener + stop_recording: EventListener + focus: EventListener + blur: EventListener + upload: EventListener + release: EventListener + select: EventListener + stream: EventListener + like: EventListener + example_select: EventListener + load: EventListener + key_up: EventListener + apply: EventListener + delete: EventListener + tick: EventListener + undo: EventListener + retry: EventListener + expand: EventListener + collapse: EventListener + download: EventListener + +__all__ = [ + "DeletedFileData", + "Dependency", + "DownloadData", + "EventData", + "EventListener", + "EventListenerMethod", + "Events", + "KeyUpData", + "LikeData", + "RetryData", + "SelectData", + "UndoData", + "on", + "set_cancel_events", +] diff --git a/src/ultimate_rvc/stubs/pedalboard_native/io/__init__.pyi b/src/ultimate_rvc/stubs/pedalboard_native/io/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..308c7f161f345a0212a7da6885c0cd6f9bdd337e --- /dev/null +++ b/src/ultimate_rvc/stubs/pedalboard_native/io/__init__.pyi @@ -0,0 +1,41 @@ +from typing import Literal, Self, overload + +import numpy as np +from numpy.typing import NDArray + +class AudioFile: + @classmethod + @overload + def __new__( + cls: object, + filename: str, + mode: Literal["r"] = "r", + ) -> ReadableAudioFile: ... + @classmethod + @overload + def __new__( + cls: object, + filename: str, + mode: Literal["w"], + samplerate: float | None = None, + num_channels: int = 1, + bit_depth: int = 16, + quality: str | float | None = None, + ) -> WriteableAudioFile: ... + +class ReadableAudioFile(AudioFile): + def __enter__(self) -> Self: ... + def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ... + def read(self, num_frames: float = 0) -> NDArray[np.float32]: ... + def tell(self) -> int: ... + @property + def frames(self) -> int: ... + @property + def num_channels(self) -> int: ... + @property + def samplerate(self) -> float | int: ... + +class WriteableAudioFile(AudioFile): + def __enter__(self) -> Self: ... + def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ... 
+ def write(self, samples: NDArray[...]) -> None: ... diff --git a/src/ultimate_rvc/stubs/soundfile/__init__.pyi b/src/ultimate_rvc/stubs/soundfile/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..a46dd07f6fe2376ce51873616dc73ce3d83cd404 --- /dev/null +++ b/src/ultimate_rvc/stubs/soundfile/__init__.pyi @@ -0,0 +1,34 @@ +from typing import Literal + +from os import PathLike + +import numpy as np +from numpy.typing import NDArray + +type DEFAULT_NDARRAY = NDArray[np.float64 | np.float32 | np.int32 | np.int16] + +def read( + file: int | str | PathLike[str] | PathLike[bytes], + frames: int = -1, + start: int = 0, + stop: int | None = None, + dtype: Literal["float64", "float32", "int32", "int16"] = "float64", + always_2d: bool = False, + fill_value: float | None = None, + out: DEFAULT_NDARRAY | None = None, + samplerate: int | None = None, + channels: int | None = None, + format: str | None = None, # noqa: A002 + subtype: str | None = None, + endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None, + closefd: bool | None = True, +) -> tuple[DEFAULT_NDARRAY, int]: ... +def write( + file: int | str | PathLike[str] | PathLike[bytes], + data: DEFAULT_NDARRAY, + samplerate: int, + subtype: str | None = None, + endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None, + format: str | None = None, # noqa: A002 + closefd: bool | None = True, +) -> None: ... diff --git a/src/ultimate_rvc/stubs/sox/__init__.pyi b/src/ultimate_rvc/stubs/sox/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..fb8017102030831d9a59825c89be09c9141ee872 --- /dev/null +++ b/src/ultimate_rvc/stubs/sox/__init__.pyi @@ -0,0 +1,19 @@ +from typing import Self + +from pathlib import Path + +from numpy.typing import NDArray + +class Transformer: + def pitch( + self, + n_semitones: float, + quick: bool = False, + ) -> Self: ... + def build_array( + self, + input_filepath: str | Path | None = None, + input_array: NDArray[...] | None = None, + sample_rate_in: float | None = None, + extra_args: list[str] | None = None, + ) -> NDArray[...]: ... diff --git a/src/ultimate_rvc/stubs/static_ffmpeg/__init__.pyi b/src/ultimate_rvc/stubs/static_ffmpeg/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..e619dd15e2616d45dbda2871c62fc0f8a65f1965 --- /dev/null +++ b/src/ultimate_rvc/stubs/static_ffmpeg/__init__.pyi @@ -0,0 +1 @@ +def add_paths(weak: bool = False) -> bool: ... diff --git a/src/ultimate_rvc/stubs/static_sox/__init__.pyi b/src/ultimate_rvc/stubs/static_sox/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..e619dd15e2616d45dbda2871c62fc0f8a65f1965 --- /dev/null +++ b/src/ultimate_rvc/stubs/static_sox/__init__.pyi @@ -0,0 +1 @@ +def add_paths(weak: bool = False) -> bool: ... diff --git a/src/ultimate_rvc/stubs/yt_dlp/__init__.pyi b/src/ultimate_rvc/stubs/yt_dlp/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..24cef7e876fe1e9640d0210e9ec2168c0574a5f5 --- /dev/null +++ b/src/ultimate_rvc/stubs/yt_dlp/__init__.pyi @@ -0,0 +1,27 @@ +from typing import Any, Self + +class YoutubeDL: + def __init__( + self, + params: dict[str, Any] | None = None, + auto_init: bool = True, + ) -> None: ... + def extract_info( + self, + url: str, + download: bool = True, + ie_key: str | None = None, + extra_info: dict[str, Any] | None = None, + process: bool = True, + force_generic_extractor: bool = False, + ) -> dict[str, Any]: ... 
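+    # Note: with download=True, extract_info both resolves metadata and
+    # downloads the media according to the params passed to YoutubeDL (for
+    # example "format" and "outtmpl"); the returned info dict is what
+    # prepare_filename below maps to the final output path.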
+ def prepare_filename( + self, + info_dict: dict[str, Any], + dir_type: str = "", + *, + outtmpl: str | None = None, + warn: bool = False, + ) -> str: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args: object) -> None: ... diff --git a/src/ultimate_rvc/typing_extra.py b/src/ultimate_rvc/typing_extra.py new file mode 100644 index 0000000000000000000000000000000000000000..359f2f284b98f99cebaead017378e106f32647fe --- /dev/null +++ b/src/ultimate_rvc/typing_extra.py @@ -0,0 +1,56 @@ +"""Extra typing for the Ultimate RVC project.""" + +from collections.abc import Mapping, Sequence +from enum import IntEnum, StrEnum +from os import PathLike + +type StrPath = str | PathLike[str] + +type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None + + +class SeparationModel(StrEnum): + """The model to use for audio separation.""" + + UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" + UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" + REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" + + +class SegmentSize(IntEnum): + """The segment size to use for audio separation.""" + + SEG_64 = 64 + SEG_128 = 128 + SEG_256 = 256 + SEG_512 = 512 + SEG_1024 = 1024 + SEG_2048 = 2048 + + +class F0Method(StrEnum): + """The method to use for pitch detection.""" + + RMVPE = "rmvpe" + MANGIO_CREPE = "mangio-crepe" + + +class SampleRate(IntEnum): + """The sample rate of an audio file.""" + + HZ_16000 = 16000 + HZ_44100 = 44100 + HZ_48000 = 48000 + HZ_96000 = 96000 + HZ_192000 = 192000 + + +class AudioExt(StrEnum): + """Audio file formats.""" + + MP3 = "mp3" + WAV = "wav" + FLAC = "flac" + OGG = "ogg" + M4A = "m4a" + AAC = "aac" diff --git a/src/ultimate_rvc/vc/__init__.py b/src/ultimate_rvc/vc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6cfc83860cf19b959bfb0209a8c4d2f63f9ec9e5 --- /dev/null +++ b/src/ultimate_rvc/vc/__init__.py @@ -0,0 +1,8 @@ +""" +Voice conversion package for the Ultimate RVC project. + +This package contains modules exposing functionality that enable voice +conversion using RVC. The implementation code is primarily built on +PyTorch to achieve high-performant processing using GPU acceleration. 
+ +""" diff --git a/src/ultimate_rvc/vc/configs/32k.json b/src/ultimate_rvc/vc/configs/32k.json new file mode 100644 index 0000000000000000000000000000000000000000..400b6be80706e9cd290fbbc963cc07b7ecf8a897 --- /dev/null +++ b/src/ultimate_rvc/vc/configs/32k.json @@ -0,0 +1,46 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,4,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/src/ultimate_rvc/vc/configs/32k_v2.json b/src/ultimate_rvc/vc/configs/32k_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..70e534f4c641a5a2c8e5c1e172f61398ee97e6e0 --- /dev/null +++ b/src/ultimate_rvc/vc/configs/32k_v2.json @@ -0,0 +1,46 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [20,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/src/ultimate_rvc/vc/configs/40k.json b/src/ultimate_rvc/vc/configs/40k.json new file mode 100644 index 0000000000000000000000000000000000000000..cb30b8be490877be1a95f257c1bb085e493eadeb --- /dev/null +++ b/src/ultimate_rvc/vc/configs/40k.json @@ -0,0 +1,46 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], 
[1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/src/ultimate_rvc/vc/configs/48k.json b/src/ultimate_rvc/vc/configs/48k.json new file mode 100644 index 0000000000000000000000000000000000000000..6875991005b9dc8f3dd0f5558c660e8fb2fc7178 --- /dev/null +++ b/src/ultimate_rvc/vc/configs/48k.json @@ -0,0 +1,46 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 11520, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/src/ultimate_rvc/vc/configs/48k_v2.json b/src/ultimate_rvc/vc/configs/48k_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..75f770cdacff3467e9e925ed2393b480881d0303 --- /dev/null +++ b/src/ultimate_rvc/vc/configs/48k_v2.json @@ -0,0 +1,46 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 17280, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [12,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [24,20,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/src/ultimate_rvc/vc/infer_pack/attentions.py b/src/ultimate_rvc/vc/infer_pack/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..1006fccb66092e3952bb78ca384b3b22cd93a1ed --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/attentions.py @@ -0,0 +1,417 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from ultimate_rvc.vc.infer_pack import commons +from ultimate_rvc.vc.infer_pack import modules +from ultimate_rvc.vc.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = 
filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + 
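+        # channels must divide evenly across heads: each head attends over
+        # k_channels = channels // n_heads dimensions, and the relative-position
+        # embeddings created below (when window_size is set) use that same
+        # per-head width.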
self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." 
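+                # Keep only a band of width 2 * block_length + 1 around the
+                # diagonal; positions outside the band are filled with -1e4
+                # before the softmax, i.e. effectively masked out.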
+ block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/src/ultimate_rvc/vc/infer_pack/commons.py b/src/ultimate_rvc/vc/infer_pack/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..54470986f37825b35d90d7efa7437d1c26b87215 --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/commons.py @@ -0,0 +1,166 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + 
x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/src/ultimate_rvc/vc/infer_pack/models.py b/src/ultimate_rvc/vc/infer_pack/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a11d526aa75ab03870836b4bfb13562911eba7cc 
--- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/models.py @@ -0,0 +1,1128 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from ultimate_rvc.vc.infer_pack import modules +from ultimate_rvc.vc.infer_pack import attentions +from ultimate_rvc.vc.infer_pack import commons +from ultimate_rvc.vc.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from ultimate_rvc.vc.infer_pack.commons import init_weights +import numpy as np +from ultimate_rvc.vc.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = 
channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + 
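+                    # Added descriptive comment (editor's note, not part of the original diff):
+                    # HiFi-GAN-style multi-receptive-field fusion: after each upsampling
+                    # stage, num_kernels parallel residual blocks (one per kernel size in
+                    # resblock_kernel_sizes) process the same x; their outputs are summed
+                    # here and averaged just below via x = xs / self.num_kernels.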
xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = ( + f0_buf / self.sampling_rate + ) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum( + rad_values, 1 + ) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 
(default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: 
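+                    # Added descriptive comment (editor's note, not part of the original diff):
+                    # GeneratorNSF adds an NSF excitation to the HiFi-GAN decoder:
+                    # m_source expands frame-level f0 into an audio-rate harmonic signal
+                    # (upp = prod(upsample_rates)), and noise_convs[i] downsamples it with
+                    # stride prod(upsample_rates[i+1:]) to the resolution of stage i before
+                    # it is added to x above; the resblock fusion below is the same
+                    # multi-kernel average as in Generator.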
+ xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, 
g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, 
+ n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + 
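+        # Added descriptive comment (editor's note, not part of the original diff):
+        # The *_nono classes are the no-F0 variants of the synthesizers above:
+        # enc_p is built with f0=False (no pitch embedding), the plain Generator
+        # replaces GeneratorNSF, and forward/infer take no pitch or nsff0 inputs.
+        # "768" refers to the content-feature dimension consumed by TextEncoder768
+        # (nn.Linear(768, hidden_channels)), versus 256 for TextEncoder256.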
self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in 
range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/src/ultimate_rvc/vc/infer_pack/models_onnx.py b/src/ultimate_rvc/vc/infer_pack/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..6d63d8a9c084bfe1ab3de248d7ed6f42bbb003ea --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/models_onnx.py @@ -0,0 +1,822 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from ultimate_rvc.vc.infer_pack import modules +from ultimate_rvc.vc.infer_pack import attentions +from ultimate_rvc.vc.infer_pack import commons +from ultimate_rvc.vc.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from ultimate_rvc.vc.infer_pack.commons import init_weights +import numpy as np +from ultimate_rvc.vc.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, 
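+        # Added descriptive comment (editor's note, not part of the original diff):
+        # TextEncoder256 projects 256-dim content/phone features (plus an optional
+        # 256-bin pitch embedding when f0=True) into hidden_channels, runs the
+        # relative-attention Encoder, and outputs out_channels * 2 channels that
+        # are split downstream into the prior mean m and log-std logs.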
+ filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + 
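+            # Added descriptive comment (editor's note, not part of the original diff):
+            # self.flows alternates [ResidualCouplingLayer, Flip, ...], so the
+            # weight-normalised coupling layers sit at even indices; Flip has no
+            # weight norm to strip, hence the i * 2 indexing below.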
self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen 
(default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = ( + f0_buf / self.sampling_rate + ) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum( + rad_values, 1 + ) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = 
SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMsNSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, 
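+        # Added descriptive comment (editor's note, not part of the original diff):
+        # SynthesizerTrnMsNSFsidM is the ONNX-export oriented synthesizer: it picks
+        # TextEncoder256 or TextEncoder768 depending on gin_channels == 256, takes
+        # the sampling noise rnd as an explicit forward() input, and can blend
+        # speaker embeddings through the table built by construct_spkmixmap().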
+ sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + if self.gin_channels == 256: + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + else: + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + self.speaker_map = None + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self, n_speaker): + self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) + for i in range(n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # 
print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/src/ultimate_rvc/vc/infer_pack/models_onnx_moess.py b/src/ultimate_rvc/vc/infer_pack/models_onnx_moess.py new file mode 100644 index 
0000000000000000000000000000000000000000..654a9f2a2dffd3ccbfa90b920ce0aa2b4ff5dcd8 --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/models_onnx_moess.py @@ -0,0 +1,853 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from ultimate_rvc.vc.infer_pack import modules +from ultimate_rvc.vc.infer_pack import attentions +from ultimate_rvc.vc.infer_pack import commons +from ultimate_rvc.vc.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from ultimate_rvc.vc.infer_pack.commons import init_weights +import numpy as np +from ultimate_rvc.vc.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder256Sim(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + x = self.proj(x) * x_mask + return x, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + 
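+        # Added descriptive comment (editor's note, not part of the original diff):
+        # ResidualCouplingBlock stacks n_flows mean-only coupling layers, each
+        # followed by a Flip that swaps the channel halves so successive couplings
+        # transform the other half. forward() applies them in order, and with
+        # reverse=True applies the inverses in reversed order at inference.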
self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + 
j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = ( + f0_buf / self.sampling_rate + ) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum( + rad_values, 1 + ) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of 
harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in 
range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class SynthesizerTrnMs256NSFsid_sim(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + # hop_length, + gin_channels=0, + use_sdp=True, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + 
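# NOTE: unlike SynthesizerTrnMs256NSFsidM above, this simplified variant never creates enc_q, so the self.enc_q reference in remove_weight_norm below would raise AttributeError if invoked. +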
self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256Sim( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + is_half=kwargs["is_half"], + ) + + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, ds, max_len=None + ): # y是spec不需要了现在 + g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + x, x_mask = self.enc_p(phone, pitch, phone_lengths) + x = self.flow(x, x_mask, g=g, reverse=True) + o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = 
period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/src/ultimate_rvc/vc/infer_pack/modules.py b/src/ultimate_rvc/vc/infer_pack/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..7897c7300cdaf96a0d9c6f159987f87c5b7b2ed7 --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/modules.py @@ -0,0 +1,522 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from ultimate_rvc.vc.infer_pack import commons +from ultimate_rvc.vc.infer_pack.commons import init_weights, get_padding +from ultimate_rvc.vc.infer_pack.transforms import piecewise_rational_quadratic_transform + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
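+ # NOTE: the assertion requires at least two layers even though the message says 'larger than 0'; the first conv maps in_channels to hidden_channels and the remaining n_layers - 1 convs stay at hidden_channels.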
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + 
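# The hidden channel count is wrapped in an IntTensor, presumably because commons.fused_add_tanh_sigmoid_multiply is the usual TorchScript-scripted helper from VITS-style code that expects the channel count as a tensor rather than a plain int. +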
n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = 
torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
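+ # h is now [b, half_channels, t, 3 * num_bins - 1]: the last axis packs the unnormalized bin widths, bin heights and the (num_bins - 1) interior knot derivatives consumed by piecewise_rational_quadratic_transform.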
+ + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/src/ultimate_rvc/vc/infer_pack/transforms.py b/src/ultimate_rvc/vc/infer_pack/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..a11f799e023864ff7082c1f49c0cc18351a13b47 --- /dev/null +++ b/src/ultimate_rvc/vc/infer_pack/transforms.py @@ -0,0 +1,209 @@ +import torch +from torch.nn import functional as F + +import numpy as np + + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + 
min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + 
(input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/src/ultimate_rvc/vc/my_utils.py b/src/ultimate_rvc/vc/my_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cf743b2506cc65955ffd49ec9a03ad69a036fa02 --- /dev/null +++ b/src/ultimate_rvc/vc/my_utils.py @@ -0,0 +1,66 @@ +"""Package which defines utility functions for voice conversion.""" + +import numpy as np +from numpy.typing import NDArray + +import ffmpeg + + +def load_audio(file: str, sr: int) -> NDArray[np.float32]: + """ + Load an audio file into a numpy array with a target sample rate. + + A subprocess is launched to decode the given audio file while + down-mixing and resampling as necessary. + + Parameters + ---------- + file : str + Path to the audio file. + sr : int + Target sample rate. + + Returns + ------- + NDArray[np.float32] + Decoded audio file in numpy array format. + + Raises + ------ + RuntimeError + If the audio file cannot be loaded. + + See Also + -------- + https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + + Notes + ----- + Requires the ffmpeg CLI and `typed-ffmpeg` package to be installed. + + """ + try: + # NOTE prevent the input path from containing spaces and + # carriage returns at the beginning and end. + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + out, _ = ( + ffmpeg.input(file, threads=0) + .output( + filename="-", + f="f32le", + acodec="pcm_f32le", + ac=1, + ar=sr, + ) + .run( + cmd=["ffmpeg", "-nostdin"], + capture_stdout=True, + capture_stderr=True, + ) + ) + + except Exception as e: + err_msg = f"Failed to load audio: {e}" + raise RuntimeError(err_msg) from e + + return np.frombuffer(out, np.float32).flatten() diff --git a/src/ultimate_rvc/vc/rmvpe.py b/src/ultimate_rvc/vc/rmvpe.py new file mode 100644 index 0000000000000000000000000000000000000000..7e83aa80dafc81a3f42a13933b3c5b220fa176e2 --- /dev/null +++ b/src/ultimate_rvc/vc/rmvpe.py @@ -0,0 +1,409 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from librosa.filters import mel + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + 
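# Two 3x3 Conv/BatchNorm/ReLU stages with a residual connection; a 1x1 shortcut projection is applied when in_channels != out_channels. +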
if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x): + concat_tensors = [] + x = self.bn(x) + for i in range(self.n_encoders): + _, x = self.layers[i](x) + concat_tensors.append(_) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for 
i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, 
(2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half == True: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + # torch.cuda.synchronize() + # t0=ttime() + mel = self.mel_extractor(audio, center=True) + # torch.cuda.synchronize() + # t1=ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2=ttime() + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # t3=ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided diff --git a/src/ultimate_rvc/vc/rvc.py b/src/ultimate_rvc/vc/rvc.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4bb25b8cdca1ccae0cae7bd361b5745adde8df --- /dev/null +++ b/src/ultimate_rvc/vc/rvc.py @@ -0,0 +1,212 @@ +from typing import Any + +from multiprocessing import cpu_count +from pathlib import Path + +from scipy.io import wavfile + +import torch +from fairseq import checkpoint_utils + +from ultimate_rvc.typing_extra import F0Method + +from ultimate_rvc.vc.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from ultimate_rvc.vc.my_utils import load_audio +from ultimate_rvc.vc.vc_infer_pipeline import VC + +SRC_DIR = Path(__file__).resolve().parent.parent + + +class Config: + def __init__(self, device, is_half): + self.device = device + self.is_half = is_half + self.n_cpu 
= 0 + self.gpu_name = None + self.gpu_mem = None + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + def device_config(self) -> tuple: + if torch.cuda.is_available(): + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + print("16 series/10 series P40 forced single precision") + self.is_half = False + for config_file in ["32k.json", "40k.json", "48k.json"]: + with open(SRC_DIR / "vc" / "configs" / config_file) as f: + strr = f.read().replace("true", "false") + with open(SRC_DIR / "vc" / "configs" / config_file, "w") as f: + f.write(strr) + with open( + SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", + ) as f: + strr = f.read().replace("3.7", "3.0") + with open( + SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", + "w", + ) as f: + f.write(strr) + else: + self.gpu_name = None + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4, + ) + if self.gpu_mem <= 4: + with open( + SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", + ) as f: + strr = f.read().replace("3.7", "3.0") + with open( + SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", + "w", + ) as f: + f.write(strr) + elif torch.backends.mps.is_available(): + print("No supported N-card found, use MPS for inference") + self.device = "mps" + else: + print("No supported N-card found, use CPU for inference") + self.device = "cpu" + self.is_half = True + + if self.n_cpu == 0: + self.n_cpu = cpu_count() + + if self.is_half: + # 6G memory config + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 + else: + # 5G memory config + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem != None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + + return x_pad, x_query, x_center, x_max + + +def load_hubert(device: str, model_path: str, *, is_half: bool) -> torch.nn.Module: + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", + ) + hubert = models[0] + hubert.to(device) + + hubert.half() if is_half else hubert.float() + + hubert.eval() + return hubert + + +def get_vc( + device: str, + config: Config, + model_path: str, + *, + is_half: bool, +) -> tuple[dict[str, Any], str, torch.nn.Module, int, VC]: + cpt = torch.load(model_path, map_location="cpu") + if "config" not in cpt or "weight" not in cpt: + err_msg = ( + f"Incorrect format for {model_path}. 
Use a voice model trained using RVC v2" + ) + raise ValueError(err_msg) + + tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + if_f0 = cpt.get("f0", 1) + version = cpt.get("version", "v1") + + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + else: + err_msg = f"Unsupported model version: {version}" + raise ValueError(err_msg) + + del net_g.enc_q + net_g.load_state_dict(cpt["weight"], strict=False) + net_g.eval().to(device) + + net_g.half() if is_half else net_g.float() + + vc = VC(tgt_sr, config) + return cpt, version, net_g, tgt_sr, vc + + +def rvc_infer( + index_path: str, + index_rate: float, + input_path: str, + output_path: str, + pitch_change: int, + f0_method: F0Method, + cpt: dict[str, Any], + version: str, + net_g: torch.nn.Module, + filter_radius: int, + tgt_sr: int, + rms_mix_rate: float, + protect: float, + crepe_hop_length: int, + vc: VC, + hubert_model: torch.nn.Module, + resample_sr: int, +) -> None: + audio = load_audio(input_path, 16000) + times = [0, 0, 0] + if_f0 = cpt.get("f0", 1) + audio_opt, output_sr = vc.pipeline( + hubert_model, + net_g, + 0, + audio, + input_path, + times, + pitch_change, + f0_method, + index_path, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + crepe_hop_length, + ) + wavfile.write(output_path, output_sr, audio_opt) diff --git a/src/ultimate_rvc/vc/trainset_preprocess_pipeline_print.py b/src/ultimate_rvc/vc/trainset_preprocess_pipeline_print.py new file mode 100644 index 0000000000000000000000000000000000000000..52879dc2f7f02292c9384221bbd2469b948eaba1 --- /dev/null +++ b/src/ultimate_rvc/vc/trainset_preprocess_pipeline_print.py @@ -0,0 +1,146 @@ +import sys, os, multiprocessing +from scipy import signal + +now_dir = os.getcwd() +sys.path.append(now_dir) + +inp_root = sys.argv[1] +sr = int(sys.argv[2]) +n_p = int(sys.argv[3]) +exp_dir = sys.argv[4] +noparallel = sys.argv[5] == "True" +import numpy as np, os, traceback +from slicer2 import Slicer +import librosa, traceback +from scipy.io import wavfile +import multiprocessing +from ultimate_rvc.vc.my_utils import load_audio +import tqdm + +DoFormant = False +Quefrency = 1.0 +Timbre = 1.0 + +mutex = multiprocessing.Lock() +f = open("%s/preprocess.log" % exp_dir, "a+") + + +def println(strr): + mutex.acquire() + print(strr) + f.write("%s\n" % strr) + f.flush() + mutex.release() + + +class PreProcess: + def __init__(self, sr, exp_dir): + self.slicer = Slicer( + sr=sr, + threshold=-42, + min_length=1500, + min_interval=400, + hop_size=15, + max_sil_kept=500, + ) + self.sr = sr + self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) + self.per = 3.0 + self.overlap = 0.3 + self.tail = self.per + self.overlap + self.max = 0.9 + self.alpha = 0.75 + self.exp_dir = exp_dir + self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir + self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir + os.makedirs(self.exp_dir, exist_ok=True) + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) + + def norm_write(self, tmp_audio, idx0, idx1): + tmp_max = np.abs(tmp_audio).max() + if tmp_max > 2.5: + print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) + return + tmp_audio = (tmp_audio / tmp_max * 
(self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + tmp_audio.astype(np.float32), + ) + tmp_audio = librosa.resample( + tmp_audio, orig_sr=self.sr, target_sr=16000 + ) # , res_type="soxr_vhq" + wavfile.write( + "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), + 16000, + tmp_audio.astype(np.float32), + ) + + def pipeline(self, path, idx0): + try: + audio = load_audio(path, self.sr, DoFormant, Quefrency, Timbre) + # zero phased digital filter cause pre-ringing noise... + # audio = signal.filtfilt(self.bh, self.ah, audio) + audio = signal.lfilter(self.bh, self.ah, audio) + + idx1 = 0 + for audio in self.slicer.slice(audio): + i = 0 + while 1: + start = int(self.sr * (self.per - self.overlap) * i) + i += 1 + if len(audio[start:]) > self.tail * self.sr: + tmp_audio = audio[start : start + int(self.per * self.sr)] + self.norm_write(tmp_audio, idx0, idx1) + idx1 += 1 + else: + tmp_audio = audio[start:] + idx1 += 1 + break + self.norm_write(tmp_audio, idx0, idx1) + # println("%s->Suc." % path) + except: + println("%s->%s" % (path, traceback.format_exc())) + + def pipeline_mp(self, infos, thread_n): + for path, idx0 in tqdm.tqdm( + infos, position=thread_n, leave=True, desc="thread:%s" % thread_n + ): + self.pipeline(path, idx0) + + def pipeline_mp_inp_dir(self, inp_root, n_p): + try: + infos = [ + ("%s/%s" % (inp_root, name), idx) + for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) + ] + if noparallel: + for i in range(n_p): + self.pipeline_mp(infos[i::n_p]) + else: + ps = [] + for i in range(n_p): + p = multiprocessing.Process( + target=self.pipeline_mp, args=(infos[i::n_p], i) + ) + ps.append(p) + p.start() + for i in range(n_p): + ps[i].join() + except: + println("Fail. 
%s" % traceback.format_exc()) + + +def preprocess_trainset(inp_root, sr, n_p, exp_dir): + pp = PreProcess(sr, exp_dir) + println("start preprocess") + println(sys.argv) + pp.pipeline_mp_inp_dir(inp_root, n_p) + println("end preprocess") + + +if __name__ == "__main__": + preprocess_trainset(inp_root, sr, n_p, exp_dir) diff --git a/src/ultimate_rvc/vc/vc_infer_pipeline.py b/src/ultimate_rvc/vc/vc_infer_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..843cc30ff21f9fdaa27c84ce6de997d600b4a035 --- /dev/null +++ b/src/ultimate_rvc/vc/vc_infer_pipeline.py @@ -0,0 +1,656 @@ +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import os +import parselmouth +import pyworld +import sys +import torch +import torch.nn.functional as F +import torchcrepe +import traceback +from scipy import signal +from torch import Tensor +from ultimate_rvc.common import BASE_DIR, MODELS_DIR + +now_dir = os.path.join(BASE_DIR, "src") +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class VC(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + config.is_half, + ) + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = config.device + + # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device) + def get_optimal_torch_device(self, index: int = 0) -> torch.device: + # Get cuda device + if torch.cuda.is_available(): + return torch.device( + f"cuda:{index % torch.cuda.device_count()}" + ) # Very fast + elif torch.backends.mps.is_available(): + return torch.device("mps") + # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library + # Else wise return the "cpu" as a torch device, + return torch.device("cpu") + + # Fork Feature: Compute f0 with the crepe method + def get_f0_crepe_computation( + self, + x, + f0_min, + f0_max, + p_len, + hop_length=160, # 512 before. 
Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. + model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full + ): + x = x.astype( + np.float32 + ) # fixes the F.conv2D exception. We needed to convert double to float. + x /= np.quantile(np.abs(x), 0.999) + torch_device = self.get_optimal_torch_device() + audio = torch.from_numpy(x).to(torch_device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + #print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) + pitch: Tensor = torchcrepe.predict( + audio, + self.sr, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=torch_device, + pad=True, + ) + p_len = p_len or x.shape[0] // hop_length + # Resize the pitch for final f0 + source = np.array(pitch.squeeze(0).cpu().float().numpy()) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source, + ) + f0 = np.nan_to_num(target) + return f0 # Resized f0 + + def get_f0_official_crepe_computation( + self, + x, + f0_min, + f0_max, + model="full", + ): + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + return f0 + + # Fork Feature: Compute pYIN f0 method + def get_f0_pyin_computation(self, x, f0_min, f0_max): + y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True) + f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max) + f0 = f0[1:] # Get rid of extra first frame + return f0 + + # Fork Feature: Acquire median hybrid f0 estimation calculation + def get_f0_hybrid_computation( + self, + methods_str, + input_audio_path, + x, + f0_min, + f0_max, + p_len, + filter_radius, + crepe_hop_length, + time_step, + ): + # Get various f0 methods from input to use in the computation stack + s = methods_str + s = s.split("hybrid")[1] + s = s.replace("[", "").replace("]", "") + methods = s.split("+") + f0_computation_stack = [] + + print("Calculating f0 pitch estimations for methods: %s" % str(methods)) + x = x.astype(np.float32) + x /= np.quantile(np.abs(x), 0.999) + # Get f0 calculations for all methods specified + for method in methods: + f0 = None + if method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif method == "crepe": + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max) + f0 = f0[1:] # Get rid of extra first frame + elif method == "crepe-tiny": + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny") + f0 = f0[1:] # Get rid of extra first frame + elif method == "mangio-crepe": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, 
p_len, crepe_hop_length + ) + elif method == "mangio-crepe-tiny": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, p_len, crepe_hop_length, "tiny" + ) + elif method == "harvest": + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + f0 = f0[1:] # Get rid of first frame. + elif method == "dio": # Potentially buggy? + f0, t = pyworld.dio( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + f0 = f0[1:] + # elif method == "pyin": Not Working just yet + # f0 = self.get_f0_pyin_computation(x, f0_min, f0_max) + # Push method to the stack + f0_computation_stack.append(f0) + + for fc in f0_computation_stack: + print(len(fc)) + + print("Calculating hybrid median f0 from the stack of: %s" % str(methods)) + f0_median_hybrid = None + if len(f0_computation_stack) == 1: + f0_median_hybrid = f0_computation_stack[0] + else: + f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0) + return f0_median_hybrid + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + crepe_hop_length, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "dio": # Potentially Buggy? 
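+ # DIO gives a fast but coarse F0 contour; stonemask then refines the
+ # per-frame estimates, and the 3-tap median filter below smooths out
+ # spurious jumps.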
+ f0, t = pyworld.dio( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max) + elif f0_method == "crepe-tiny": + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny") + elif f0_method == "mangio-crepe": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, p_len, crepe_hop_length + ) + elif f0_method == "mangio-crepe-tiny": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, p_len, crepe_hop_length, "tiny" + ) + elif f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from ultimate_rvc.vc.rmvpe import RMVPE + + self.model_rmvpe = RMVPE( + os.path.join(MODELS_DIR, "rvc", "rmvpe.pt"), + is_half=self.is_half, + device=self.device, + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + elif "hybrid" in f0_method: + # Perform hybrid median pitch estimation + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = self.get_f0_hybrid_computation( + f0_method, + input_audio_path, + x, + f0_min, + f0_max, + p_len, + filter_radius, + crepe_hop_length, + time_step, + ) + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(int) + + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = feats.clone() + if ( + isinstance(index, type(None)) == False + and isinstance(big_npy, type(None)) == False + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + 
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch != None and pitchf != None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch != None and pitchf != None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch != None and pitchf != None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + # file_big_npy, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + crepe_hop_length, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) == True + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name") == True: + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + crepe_hop_length, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in 
opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if resample_sr >= 16000 and tgt_sr != resample_sr: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + tgt_sr = resample_sr + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt, tgt_sr diff --git a/src/ultimate_rvc/web/__init__.py b/src/ultimate_rvc/web/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9aeaabe3a5720a1328d42ccfcb1dabf4db0d1d0f --- /dev/null +++ b/src/ultimate_rvc/web/__init__.py @@ -0,0 +1,10 @@ +""" +Web package for the Ultimate RVC project. + +This package contains modules which define the web application of the +Ultimate RVC project. +""" + +from ultimate_rvc.core.main import initialize + +initialize() diff --git a/src/ultimate_rvc/web/common.py b/src/ultimate_rvc/web/common.py new file mode 100644 index 0000000000000000000000000000000000000000..80066f7dbb5058357826c2ffb269a92f961e5359 --- /dev/null +++ b/src/ultimate_rvc/web/common.py @@ -0,0 +1,398 @@ +""" +Module defining common utility functions and classes for the +web application of the Ultimate RVC project. +""" + +from typing import Any, Concatenate + +from collections.abc import Callable, Sequence + +import gradio as gr + +from ultimate_rvc.core.exceptions import NotProvidedError +from ultimate_rvc.core.generate.song_cover import ( + get_named_song_dirs, + get_song_cover_name, +) +from ultimate_rvc.core.manage.audio import get_saved_output_audio +from ultimate_rvc.web.typing_extra import ( + ComponentVisibilityKwArgs, + DropdownChoices, + DropdownValue, + TextBoxKwArgs, + UpdateDropdownKwArgs, +) + +PROGRESS_BAR = gr.Progress() + + +def exception_harness[T, **P]( + fn: Callable[P, T], + info_msg: str | None = None, +) -> Callable[P, T]: + """ + Wrap a function in a harness that catches exceptions and re-raises + them as instances of `gradio.Error`. + + Parameters + ---------- + fn : Callable[P, T] + The function to wrap. + + info_msg : str, optional + Message to display in an info-box pop-up after the function + executes successfully. 
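+ For example, the multi-step generation tab wraps song retrieval as
+ ``exception_harness(retrieve_song, info_msg="Song retrieved successfully!")``,
+ so any failure surfaces as a pop-up error while success shows a brief
+ notification.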
+ + Returns + ------- + Callable[P, T] + The wrapped function. + + """ + + def _wrapped_fn(*args: P.args, **kwargs: P.kwargs) -> T: + try: + res = fn(*args, **kwargs) + except gr.Error: + raise + except NotProvidedError as e: + msg = e.ui_msg or e + raise gr.Error(str(msg)) from None + except Exception as e: + raise gr.Error(str(e)) from e + else: + if info_msg: + gr.Info(info_msg, duration=0.5) + return res + + return _wrapped_fn + + +def confirmation_harness[T, **P]( + fn: Callable[P, T], +) -> Callable[Concatenate[bool, P], T]: + """ + Wrap a function in a harness that requires a confirmation before + executing and catches exceptions, re-raising them as instances of + `gradio.Error`. + + Parameters + ---------- + fn : Callable[P, T] + The function to wrap. + + Returns + ------- + Callable[Concatenate[bool, P], T] + The wrapped function. + + """ + + def _wrapped_fn(confirm: bool, *args: P.args, **kwargs: P.kwargs) -> T: + if confirm: + return exception_harness(fn)(*args, **kwargs) + err_msg = "Confirmation missing!" + raise gr.Error(err_msg) + + return _wrapped_fn + + +def render_msg( + template: str, + *args: str, + display_info: bool = False, + **kwargs: str, +) -> str: + """ + Render a message template with the provided arguments. + + Parameters + ---------- + template : str + Message template to render. + args : str + Positional arguments to pass to the template. + display_info : bool, default=False + Whether to display the rendered message as an info message + in addition to returning it. + kwargs : str + Keyword arguments to pass to the template. + + Returns + ------- + str + Rendered message. + + """ + msg = template.format(*args, **kwargs) + if display_info: + gr.Info(msg) + return msg + + +def confirm_box_js(msg: str) -> str: + """ + Generate a JavaScript code snippet which: + * defines an anonymous function that takes one named parameter and + zero or more unnamed parameters + * renders a confirmation box + * returns the choice selected by the user in that confirmation + box in addition to any unnamed parameters passed to the function. + + Parameters + ---------- + msg : str + Message to display in the confirmation box rendered by the + JavaScript code snippet. + + Returns + ------- + str + The JavaScript code snippet. + + """ + return f"(x, ...args) => [confirm('{msg}'), ...args]" + + +def update_value(x: str) -> dict[str, Any]: + """ + Update the value of a component. + + Parameters + ---------- + x : str + New value for the component. + + Returns + ------- + dict[str, Any] + Dictionary which updates the value of the component. + + """ + return gr.update(value=x) + + +def update_dropdowns[**P]( + fn: Callable[P, DropdownChoices], + num_components: int, + value: DropdownValue = None, + value_indices: Sequence[int] = [], + *args: P.args, + **kwargs: P.kwargs, +) -> gr.Dropdown | tuple[gr.Dropdown, ...]: + """ + Update the choices and optionally the value of one or more dropdown + components. + + Parameters + ---------- + fn : Callable[P, DropdownChoices] + Function to get updated choices for the dropdown components. + num_components : int + Number of dropdown components to update. + value : DropdownValue, optional + New value for dropdown components. + value_indices : Sequence[int], default=[] + Indices of dropdown components to update the value for. + args : P.args + Positional arguments to pass to the function used to update + dropdown choices. + kwargs : P.kwargs + Keyword arguments to pass to the function used to update + dropdown choices. 
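+ For example, ``update_dropdowns(get_named_song_dirs, 3)`` refreshes the
+ choices of three dropdown components with the currently cached songs,
+ which is how ``update_cached_songs`` below uses this function.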
+ + Returns + ------- + gr.Dropdown | tuple[gr.Dropdown,...] + Updated dropdown component or components. + + Raises + ------ + ValueError + If not all provided indices are unique or if an index exceeds + or is equal to the number of dropdown components. + + """ + if len(value_indices) != len(set(value_indices)): + err_msg = "Value indices must be unique." + raise ValueError(err_msg) + if value_indices and max(value_indices) >= num_components: + err_msg = ( + "Index of a dropdown component to update the value for exceeds the number" + " of dropdown components to update." + ) + raise ValueError(err_msg) + updated_choices = fn(*args, **kwargs) + update_args_list: list[UpdateDropdownKwArgs] = [ + {"choices": updated_choices} for _ in range(num_components) + ] + for index in value_indices: + update_args_list[index]["value"] = value + + match update_args_list: + case [update_args]: + # NOTE This is a workaround as gradio does not support + # singleton tuples for components. + return gr.Dropdown(**update_args) + case _: + return tuple(gr.Dropdown(**update_args) for update_args in update_args_list) + + +def update_cached_songs( + num_components: int, + value: DropdownValue = None, + value_indices: Sequence[int] = [], +) -> gr.Dropdown | tuple[gr.Dropdown, ...]: + """ + Update the choices of one or more dropdown components to the set of + currently cached songs. + + Optionally update the default value of one or more of these + components. + + Parameters + ---------- + num_components : int + Number of dropdown components to update. + value : DropdownValue, optional + New value for the dropdown components. + value_indices : Sequence[int], default=[] + Indices of dropdown components to update the value for. + + Returns + ------- + gr.Dropdown | tuple[gr.Dropdown,...] + Updated dropdown component or components. + + """ + return update_dropdowns(get_named_song_dirs, num_components, value, value_indices) + + +def update_output_audio( + num_components: int, + value: DropdownValue = None, + value_indices: Sequence[int] = [], +) -> gr.Dropdown | tuple[gr.Dropdown, ...]: + """ + Update the choices of one or more dropdown components to the set of + currently saved output audio files. + + Optionally update the default value of one or more of these + components. + + Parameters + ---------- + num_components : int + Number of dropdown components to update. + value : DropdownValue, optional + New value for dropdown components. + value_indices : Sequence[int], default=[] + Indices of dropdown components to update the value for. + + Returns + ------- + gr.Dropdown | tuple[gr.Dropdown,...] + Updated dropdown component or components. + + """ + return update_dropdowns( + get_saved_output_audio, + num_components, + value, + value_indices, + ) + + +def toggle_visible_component( + num_components: int, + visible_index: int, +) -> dict[str, Any] | tuple[dict[str, Any], ...]: + """ + Reveal a single component from a set of components. All other + components are hidden. + + Parameters + ---------- + num_components : int + Number of components to set visibility for. + visible_index : int + Index of the component to reveal. + + Returns + ------- + dict[str, Any] | tuple[dict[str, Any], ...] + A single dictionary or a tuple of dictionaries that update the + visibility of the components. + + Raises + ------ + ValueError + If the visible index exceeds or is equal to the number of + components to set visibility for. 
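+
+ Examples
+ --------
+ Mirroring the source-type switch in the multi-step generation tab,
+ where only one of three source widgets is shown at a time::
+
+ updates = toggle_visible_component(3, 1)  # reveal widget 1, hide widgets 0 and 2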
+ + """ + if visible_index >= num_components: + err_msg = ( + "Visible index must be less than the number of components to set visibility" + " for." + ) + raise ValueError(err_msg) + update_args_list: list[ComponentVisibilityKwArgs] = [ + {"visible": False, "value": None} for _ in range(num_components) + ] + update_args_list[visible_index]["visible"] = True + match update_args_list: + case [update_args]: + return gr.update(**update_args) + case _: + return tuple(gr.update(**update_args) for update_args in update_args_list) + + +def update_song_cover_name( + effected_vocals_track: str | None = None, + song_dir: str | None = None, + model_name: str | None = None, + update_placeholder: bool = False, +) -> gr.Textbox: + """ + Update a textbox component so that it displays a suitable name for a + cover of a given song. + + If the path of an existing song directory is provided, the name of + the song is inferred from that directory. If the name of a voice + model is not provided but the path of an existing song directory + and the path of an effected vocals track in that directory are + provided, then the voice model is inferred from the effected vocals + track. + + + Parameters + ---------- + effected_vocals_track : str, optional + The path to an effected vocals track. + song_dir : str, optional + The path to a song directory. + model_name : str, optional + The name of a voice model. + update_placeholder : bool, default=False + Whether to update the placeholder text instead of the value of + the textbox component. + + Returns + ------- + gr.Textbox + Textbox component with updated value or placeholder text. + + """ + update_args: TextBoxKwArgs = {} + update_key = "placeholder" if update_placeholder else "value" + if effected_vocals_track or song_dir or model_name: + song_cover_name = get_song_cover_name( + effected_vocals_track, + song_dir, + model_name, + ) + update_args[update_key] = song_cover_name + else: + update_args[update_key] = None + return gr.Textbox(**update_args) diff --git a/src/ultimate_rvc/web/main.py b/src/ultimate_rvc/web/main.py new file mode 100644 index 0000000000000000000000000000000000000000..86b276fbc5f2e7accae2b4b4187decdd759f7640 --- /dev/null +++ b/src/ultimate_rvc/web/main.py @@ -0,0 +1,251 @@ +""" +Web application for the Ultimate RVC project. + +Each tab of the application is defined in its own module in the +`web/tabs` directory. Components that are accessed across multiple +tabs are passed as arguments to the render functions in the respective +modules. +""" + +from typing import Annotated + +import os + +import gradio as gr + +import typer + +from ultimate_rvc.common import AUDIO_DIR, MODELS_DIR, TEMP_DIR +from ultimate_rvc.core.generate.song_cover import get_named_song_dirs +from ultimate_rvc.core.manage.audio import get_saved_output_audio +from ultimate_rvc.core.manage.models import get_saved_model_names +from ultimate_rvc.web.tabs.manage_audio import render as render_manage_audio_tab +from ultimate_rvc.web.tabs.manage_models import render as render_manage_models_tab +from ultimate_rvc.web.tabs.multi_step_generation import render as render_multi_step_tab +from ultimate_rvc.web.tabs.one_click_generation import render as render_one_click_tab +from ultimate_rvc.web.tabs.other_settings import render as render_other_settings_tab + +app_wrapper = typer.Typer() + + +def _init_app() -> list[gr.Dropdown]: + """ + Initialize the Ultimate RVC web application by updating the choices + of all dropdown components. + + Returns + ------- + tuple[gr.Dropdown, ...] 
+ Updated dropdowns for selecting voice models, cached songs, + and output audio files. + + """ + model_names = get_saved_model_names() + named_song_dirs = get_named_song_dirs() + models = [ + gr.Dropdown( + choices=model_names, + value=None if not model_names else model_names[0], + ) + for _ in range(2) + ] + model_delete = [gr.Dropdown(choices=model_names)] + cached_songs = [gr.Dropdown(choices=named_song_dirs) for _ in range(3)] + song_dirs = [ + gr.Dropdown( + choices=named_song_dirs, + value=None if not named_song_dirs else named_song_dirs[0][1], + ) + for _ in range(5) + ] + output_audio = [gr.Dropdown(choices=get_saved_output_audio())] + return models + model_delete + cached_songs + song_dirs + output_audio + + +def render_app() -> gr.Blocks: + """ + Render the Ultimate RVC web application. + + Returns + ------- + gr.Blocks + The rendered web application. + + """ + css = """ + h1 { text-align: center; margin-top: 20px; margin-bottom: 20px; } + """ + cache_delete_frequency = 86400 # every 24 hours check for files to delete + cache_delete_cutoff = 86400 # and delete files older than 24 hours + + with gr.Blocks( + title="Ultimate RVC", + css=css, + delete_cache=(cache_delete_frequency, cache_delete_cutoff), + ) as app: + gr.HTML("

<h1>Ultimate RVC 🧡</h1>
") + song_dirs = [ + gr.Dropdown( + # NOTE choices and value must be explicitly set like + # this to avoid caching issues when reloading the app + # (and hence calling _init_app) in both production and + # development modes + choices=get_named_song_dirs(), + value=None, + label="Song directory", + info=( + "Directory where intermediate audio files are stored and loaded" + " from locally. When a new song is retrieved, its directory is" + " chosen by default." + ), + render=False, + ) + for _ in range(5) + ] + cached_song_1click, cached_song_multi = [ + gr.Dropdown( + label="Source", + info="Select a song from the list of cached songs.", + visible=False, + render=False, + ) + for _ in range(2) + ] + intermediate_audio = gr.Dropdown( + label="Song directories", + multiselect=True, + info=( + "Select one or more song directories containing intermediate audio" + " files to delete." + ), + render=False, + ) + output_audio = gr.Dropdown( + label="Output audio files", + multiselect=True, + info="Select one or more output audio files to delete.", + render=False, + ) + model_1click, model_multi = [ + gr.Dropdown( + # NOTE choices and value must be explicitly set like + # this to avoid caching issues when reloading the app + # (and hence calling _init_app) in both production and + # development modes + choices=get_saved_model_names(), + value=None, + label="Voice model", + render=False, + info="Select a voice model to use for converting vocals.", + ) + for _ in range(2) + ] + model_delete = gr.Dropdown(label="Voice models", multiselect=True, render=False) + + # main tab + with gr.Tab("Generate song covers"): + render_one_click_tab( + song_dirs, + cached_song_1click, + cached_song_multi, + model_1click, + intermediate_audio, + output_audio, + ) + render_multi_step_tab( + song_dirs, + cached_song_1click, + cached_song_multi, + model_multi, + intermediate_audio, + output_audio, + ) + with gr.Tab("Manage models"): + render_manage_models_tab( + model_delete, + model_1click, + model_multi, + ) + with gr.Tab("Manage audio"): + render_manage_audio_tab( + song_dirs, + cached_song_1click, + cached_song_multi, + intermediate_audio, + output_audio, + ) + with gr.Tab("Other settings"): + render_other_settings_tab() + + app.load( + _init_app, + outputs=[ + model_1click, + model_multi, + model_delete, + intermediate_audio, + cached_song_1click, + cached_song_multi, + *song_dirs, + output_audio, + ], + show_progress="hidden", + ) + return app + + +app = render_app() + + +@app_wrapper.command() +def start_app( + share: Annotated[ + bool, + typer.Option("--share", "-s", help="Enable sharing"), + ] = False, + listen: Annotated[ + bool, + typer.Option( + "--listen", + "-l", + help="Make the web application reachable from your local network.", + ), + ] = False, + listen_host: Annotated[ + str | None, + typer.Option( + "--listen-host", + "-h", + help="The hostname that the server will use.", + ), + ] = None, + listen_port: Annotated[ + int | None, + typer.Option( + "--listen-port", + "-p", + help="The listening port that the server will use.", + ), + ] = None, + ssr_mode: Annotated[ + bool, + typer.Option( + "--ssr-mode", + help="Enable server-side rendering mode.", + ), + ] = False, +) -> None: + """Run the Ultimate RVC web application.""" + os.environ["GRADIO_TEMP_DIR"] = str(TEMP_DIR) + gr.set_static_paths([MODELS_DIR, AUDIO_DIR]) + app.queue() + app.launch( + share=share, + server_name=(None if not listen else (listen_host or "0.0.0.0")), # noqa: S104 + server_port=listen_port, + ssr_mode=ssr_mode, + ) + + +if 
__name__ == "__main__": + app_wrapper() diff --git a/src/ultimate_rvc/web/tabs/__init__.py b/src/ultimate_rvc/web/tabs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..69d4f9b9a5706eb19b32ef38dd5aa41334d664dc --- /dev/null +++ b/src/ultimate_rvc/web/tabs/__init__.py @@ -0,0 +1,7 @@ +""" +web package for tabs. + +This package contains modules which define functions for rendering each +tab in web application of the Ultimate RVC project. + +""" diff --git a/src/ultimate_rvc/web/tabs/manage_audio.py b/src/ultimate_rvc/web/tabs/manage_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..1407caa6e168cc3dc6316109612028a310a610e5 --- /dev/null +++ b/src/ultimate_rvc/web/tabs/manage_audio.py @@ -0,0 +1,211 @@ +"""Module which defines the code for the "Manage audio" tab.""" + +from collections.abc import Sequence +from functools import partial + +import gradio as gr + +from ultimate_rvc.core.manage.audio import ( + delete_all_audio, + delete_all_intermediate_audio, + delete_all_output_audio, + delete_intermediate_audio, + delete_output_audio, +) +from ultimate_rvc.web.common import ( + PROGRESS_BAR, + confirm_box_js, + confirmation_harness, + render_msg, + update_cached_songs, + update_output_audio, +) + + +def render( + song_dirs: Sequence[gr.Dropdown], + cached_song_1click: gr.Dropdown, + cached_song_multi: gr.Dropdown, + intermediate_audio: gr.Dropdown, + output_audio: gr.Dropdown, +) -> None: + """ + Render "Manage audio" tab. + + Parameters + ---------- + song_dirs : Sequence[gr.Dropdown] + Dropdown components for selecting song directories in the + "Multi-step generation" tab. + cached_song_1click : gr.Dropdown + Dropdown for selecting a cached song in the + "One-click generation" tab + cached_song_multi : gr.Dropdown + Dropdown for selecting a cached song in the + "Multi-step generation" tab + intermediate_audio : gr.Dropdown + Dropdown for selecting intermediate audio files to delete in the + "Delete audio" tab. + output_audio : gr.Dropdown + Dropdown for selecting output audio files to delete in the + "Delete audio" tab. 
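+
+ Notes
+ -----
+ The hidden ``dummy_checkbox`` defined below acts as a placeholder first
+ input for every delete event: the JavaScript snippet returned by
+ ``confirm_box_js`` replaces its value with the result of the browser
+ confirmation dialog, which ``confirmation_harness`` then checks before
+ running the wrapped delete function.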
+ + """ + dummy_checkbox = gr.Checkbox(visible=False) + with gr.Tab("Delete audio"): + with gr.Accordion("Intermediate audio", open=False), gr.Row(): + with gr.Column(): + intermediate_audio.render() + intermediate_audio_btn = gr.Button( + "Delete selected", + variant="secondary", + ) + all_intermediate_audio_btn = gr.Button( + "Delete all", + variant="primary", + ) + with gr.Column(): + intermediate_audio_msg = gr.Textbox( + label="Output message", + interactive=False, + ) + with gr.Accordion("Output audio", open=False), gr.Row(): + with gr.Column(): + output_audio.render() + output_audio_btn = gr.Button( + "Delete selected", + variant="secondary", + ) + all_output_audio_btn = gr.Button( + "Delete all", + variant="primary", + ) + with gr.Column(): + output_audio_msg = gr.Textbox( + label="Output message", + interactive=False, + ) + with gr.Accordion("All audio", open=True), gr.Row(equal_height=True): + all_audio_btn = gr.Button("Delete", variant="primary") + all_audio_msg = gr.Textbox(label="Output message", interactive=False) + + intermediate_audio_click = intermediate_audio_btn.click( + partial( + confirmation_harness(delete_intermediate_audio), + progress_bar=PROGRESS_BAR, + ), + inputs=[dummy_checkbox, intermediate_audio], + outputs=intermediate_audio_msg, + js=confirm_box_js( + "Are you sure you want to delete the selected song directories?", + ), + ).success( + partial( + render_msg, + "[-] Successfully deleted the selected song directories!", + ), + outputs=intermediate_audio_msg, + show_progress="hidden", + ) + + all_intermediate_audio_click = all_intermediate_audio_btn.click( + partial( + confirmation_harness(delete_all_intermediate_audio), + progress_bar=PROGRESS_BAR, + ), + inputs=dummy_checkbox, + outputs=intermediate_audio_msg, + js=confirm_box_js( + "Are you sure you want to delete all intermediate audio files?", + ), + ).success( + partial( + render_msg, + "[-] Successfully deleted all intermediate audio files!", + ), + outputs=intermediate_audio_msg, + show_progress="hidden", + ) + + output_audio_click = output_audio_btn.click( + partial( + confirmation_harness(delete_output_audio), + progress_bar=PROGRESS_BAR, + ), + inputs=[dummy_checkbox, output_audio], + outputs=output_audio_msg, + js=confirm_box_js( + "Are you sure you want to delete the selected output audio files?", + ), + ).success( + partial( + render_msg, + "[-] Successfully deleted the selected output audio files!", + ), + outputs=output_audio_msg, + show_progress="hidden", + ) + + all_output_audio_click = all_output_audio_btn.click( + partial( + confirmation_harness(delete_all_output_audio), + progress_bar=PROGRESS_BAR, + ), + inputs=dummy_checkbox, + outputs=output_audio_msg, + js=confirm_box_js( + "Are you sure you want to delete all output audio files?", + ), + ).success( + partial(render_msg, "[-] Successfully deleted all output audio files!"), + outputs=output_audio_msg, + show_progress="hidden", + ) + + all_audio_click = all_audio_btn.click( + partial( + confirmation_harness(delete_all_audio), + progress_bar=PROGRESS_BAR, + ), + inputs=dummy_checkbox, + outputs=all_audio_msg, + js=confirm_box_js("Are you sure you want to delete all audio files?"), + ).success( + partial(render_msg, "[-] Successfully deleted all audio files!"), + outputs=all_audio_msg, + show_progress="hidden", + ) + + _, _, all_audio_update = [ + click_event.success( + partial( + update_cached_songs, + 3 + len(song_dirs), + [], + [0], + ), + outputs=[ + intermediate_audio, + cached_song_1click, + cached_song_multi, + *song_dirs, + ], + 
show_progress="hidden", + ) + for click_event in [ + intermediate_audio_click, + all_intermediate_audio_click, + all_audio_click, + ] + ] + + for click_event in [ + output_audio_click, + all_output_audio_click, + all_audio_update, + ]: + click_event.success( + partial(update_output_audio, 1, [], [0]), + outputs=[output_audio], + show_progress="hidden", + ) diff --git a/src/ultimate_rvc/web/tabs/manage_models.py b/src/ultimate_rvc/web/tabs/manage_models.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec0bd9dc2cc31a658ab6ae75697849a97b46d5d --- /dev/null +++ b/src/ultimate_rvc/web/tabs/manage_models.py @@ -0,0 +1,320 @@ +"""Module which defines the code for the "Manage models" tab.""" + +from collections.abc import Sequence +from functools import partial + +import gradio as gr +import pandas as pd + +from ultimate_rvc.core.manage.models import ( + delete_all_models, + delete_models, + download_model, + filter_public_models_table, + get_public_model_tags, + get_saved_model_names, + upload_model, +) +from ultimate_rvc.web.common import ( + PROGRESS_BAR, + confirm_box_js, + confirmation_harness, + exception_harness, + render_msg, + update_dropdowns, +) +from ultimate_rvc.web.typing_extra import DropdownValue + + +def _update_models( + num_components: int, + value: DropdownValue = None, + value_indices: Sequence[int] = [], +) -> gr.Dropdown | tuple[gr.Dropdown, ...]: + """ + Update the choices of one or more dropdown components to the set of + currently saved voice models. + + Optionally updates the default value of one or more of these + components. + + Parameters + ---------- + num_components : int + Number of dropdown components to update. + value : DropdownValue, optional + New value for dropdown components. + value_indices : Sequence[int], default=[] + Indices of dropdown components to update the value for. + + Returns + ------- + gr.Dropdown | tuple[gr.Dropdown, ...] + Updated dropdown component or components. + + """ + return update_dropdowns(get_saved_model_names, num_components, value, value_indices) + + +def _filter_public_models_table(tags: Sequence[str], query: str) -> gr.Dataframe: + """ + Filter table containing metadata of public voice models by tags and + a search query. + + Parameters + ---------- + tags : Sequence[str] + Tags to filter the metadata table by. + query : str + Search query to filter the metadata table by. + + Returns + ------- + gr.Dataframe + The filtered table rendered in a Gradio dataframe. + + """ + models_table = filter_public_models_table(tags, query) + return gr.Dataframe(value=models_table) + + +def _autofill_model_name_and_url( + public_models_table: pd.DataFrame, + select_event: gr.SelectData, +) -> tuple[gr.Textbox, gr.Textbox]: + """ + Autofill two textboxes with respectively the name and URL that is + saved in the currently selected row of the public models table. + + Parameters + ---------- + public_models_table : pd.DataFrame + The public models table saved in a Pandas dataframe. + select_event : gr.SelectData + Event containing the index of the currently selected row in the + public models table. + + Returns + ------- + name : gr.Textbox + The textbox containing the model name. + + url : gr.Textbox + The textbox containing the model URL. + + Raises + ------ + TypeError + If the index in the provided event is not a sequence. + + """ + event_index = select_event.index + if not isinstance(event_index, Sequence): + err_msg = ( + f"Expected a sequence of indices but got {type(event_index)} from the" + " provided event." 
+ ) + raise TypeError(err_msg) + event_index = event_index[0] + url = public_models_table.loc[event_index, "URL"] + name = public_models_table.loc[event_index, "Name"] + if isinstance(url, str) and isinstance(name, str): + return gr.Textbox(value=name), gr.Textbox(value=url) + err_msg = ( + "Expected model name and URL to be strings but got" + f" {type(name)} and {type(url)} respectively." + ) + raise TypeError(err_msg) + + +def render( + model_delete: gr.Dropdown, + model_1click: gr.Dropdown, + model_multi: gr.Dropdown, +) -> None: + """ + + Render "Manage models" tab. + + Parameters + ---------- + model_delete : gr.Dropdown + Dropdown for selecting voice models to delete in the + "Delete models" tab. + model_1click : gr.Dropdown + Dropdown for selecting a voice model to use in the + "One-click generation" tab. + model_multi : gr.Dropdown + Dropdown for selecting a voice model to use in the + "Multi-step generation" tab. + + """ + # Download tab + + dummy_checkbox = gr.Checkbox(visible=False) + with gr.Tab("Download model"): + with gr.Accordion("View public models table", open=False): + gr.Markdown("") + gr.Markdown("*HOW TO USE*") + gr.Markdown( + "- Filter voice models by selecting one or more tags and/or providing a" + " search query.", + ) + gr.Markdown( + "- Select a row in the table to autofill the name and" + " URL for the given voice model in the form fields below.", + ) + gr.Markdown("") + with gr.Row(): + search_query = gr.Textbox(label="Search query") + tags = gr.CheckboxGroup( + value=[], + label="Tags", + choices=get_public_model_tags(), + ) + with gr.Row(): + public_models_table = gr.Dataframe( + value=_filter_public_models_table, + inputs=[tags, search_query], + headers=["Name", "Description", "Tags", "Credit", "Added", "URL"], + label="Public models table", + interactive=False, + ) + + with gr.Row(): + model_url = gr.Textbox( + label="Model URL", + info=( + "Should point to a zip file containing a .pth model file and" + " optionally also an .index file." + ), + ) + model_name = gr.Textbox( + label="Model name", + info="Enter a unique name for the voice model.", + ) + + with gr.Row(equal_height=True): + download_btn = gr.Button("Download 🌐", variant="primary", scale=19) + download_msg = gr.Textbox( + label="Output message", + interactive=False, + scale=20, + ) + + public_models_table.select( + _autofill_model_name_and_url, + inputs=public_models_table, + outputs=[model_name, model_url], + show_progress="hidden", + ) + + download_btn_click = download_btn.click( + partial( + exception_harness(download_model), + progress_bar=PROGRESS_BAR, + ), + inputs=[model_url, model_name], + outputs=download_msg, + ).success( + partial( + render_msg, + "[+] Succesfully downloaded voice model!", + ), + inputs=model_name, + outputs=download_msg, + show_progress="hidden", + ) + + # Upload tab + with gr.Tab("Upload model"): + with gr.Accordion("HOW TO USE"): + gr.Markdown("") + gr.Markdown( + "1. Find the .pth file for a locally trained RVC model (e.g. in your" + " local weights folder) and optionally also a corresponding .index file" + " (e.g. in your logs/[name] folder)", + ) + gr.Markdown( + "2. Upload the files directly or save them to a folder, then compress" + " that folder and upload the resulting .zip file", + ) + gr.Markdown("3. Enter a unique name for the uploaded model") + gr.Markdown("4. 
Click 'Upload'") + + with gr.Row(): + model_files = gr.File( + label="Files", + file_count="multiple", + file_types=[".zip", ".pth", ".index"], + ) + + local_model_name = gr.Textbox(label="Model name") + + with gr.Row(equal_height=True): + upload_btn = gr.Button("Upload", variant="primary", scale=19) + upload_msg = gr.Textbox( + label="Output message", + interactive=False, + scale=20, + ) + upload_btn_click = upload_btn.click( + partial(exception_harness(upload_model), progress_bar=PROGRESS_BAR), + inputs=[model_files, local_model_name], + outputs=upload_msg, + ).success( + partial( + render_msg, + "[+] Successfully uploaded voice model!", + ), + inputs=local_model_name, + outputs=upload_msg, + show_progress="hidden", + ) + + with gr.Tab("Delete models"): + with gr.Row(): + with gr.Column(): + model_delete.render() + delete_btn = gr.Button("Delete selected", variant="secondary") + delete_all_btn = gr.Button("Delete all", variant="primary") + with gr.Column(): + delete_msg = gr.Textbox(label="Output message", interactive=False) + delete_btn_click = delete_btn.click( + partial(confirmation_harness(delete_models), progress_bar=PROGRESS_BAR), + inputs=[dummy_checkbox, model_delete], + outputs=delete_msg, + js=confirm_box_js( + "Are you sure you want to delete the selected voice models?", + ), + ).success( + partial(render_msg, "[-] Successfully deleted selected voice models!"), + outputs=delete_msg, + show_progress="hidden", + ) + + delete_all_btn_click = delete_all_btn.click( + partial( + confirmation_harness(delete_all_models), + progress_bar=PROGRESS_BAR, + ), + inputs=dummy_checkbox, + outputs=delete_msg, + js=confirm_box_js("Are you sure you want to delete all voice models?"), + ).success( + partial(render_msg, "[-] Successfully deleted all voice models!"), + outputs=delete_msg, + show_progress="hidden", + ) + + for click_event in [ + download_btn_click, + upload_btn_click, + delete_btn_click, + delete_all_btn_click, + ]: + click_event.success( + partial(_update_models, 3, [], [2]), + outputs=[model_1click, model_multi, model_delete], + show_progress="hidden", + ) diff --git a/src/ultimate_rvc/web/tabs/multi_step_generation.py b/src/ultimate_rvc/web/tabs/multi_step_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..b00c53801bcb1766d6ee17b061e908ff51f486b4 --- /dev/null +++ b/src/ultimate_rvc/web/tabs/multi_step_generation.py @@ -0,0 +1,957 @@ +"""Module which defines the code for the "Multi-step generation" tab.""" + +from typing import TYPE_CHECKING, Any + +from collections.abc import Sequence +from functools import partial + +import gradio as gr + +from ultimate_rvc.core.generate.song_cover import ( + convert, + mix_song, + pitch_shift, + postprocess, + retrieve_song, + separate_audio, +) +from ultimate_rvc.typing_extra import ( + AudioExt, + F0Method, + SampleRate, + SegmentSize, + SeparationModel, +) +from ultimate_rvc.web.common import ( + PROGRESS_BAR, + exception_harness, + toggle_visible_component, + update_cached_songs, + update_output_audio, + update_song_cover_name, + update_value, +) +from ultimate_rvc.web.typing_extra import ConcurrencyId, SourceType + +if TYPE_CHECKING: + from ultimate_rvc.web.typing_extra import UpdateAudioKwArgs + + +def _update_audio( + num_components: int, + output_indices: Sequence[int], + track: str | None, + disallow_none: bool = True, +) -> gr.Audio | tuple[gr.Audio, ...]: + """ + Update the value of a subset of `Audio` components to the given + audio track. 
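+ For example, ``_update_audio(8, [0], "/tmp/song.wav")`` (hypothetical
+ path) returns eight ``Audio`` updates of which only the first has its
+ value set to the given track.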
+ + Parameters + ---------- + num_components : int + The total number of `Audio` components under consideration. + output_indices : Sequence[int] + Indices of `Audio` components to update the value for. + track : str + Path pointing to an audio track to update the value of the + indexed `Audio` components with. + disallow_none : bool, default=True + Whether to disallow the value of the indexed components to be + `None`. + + Returns + ------- + gr.Audio | tuple[gr.Audio, ...] + Each `Audio` component under consideration with the value of the + indexed components updated to the given audio track. + + """ + update_args_list: list[UpdateAudioKwArgs] = [{} for _ in range(num_components)] + for index in output_indices: + if track or not disallow_none: + update_args_list[index]["value"] = track + match update_args_list: + case [update_args]: + return gr.Audio(**update_args) + case _: + return tuple(gr.Audio(**update_args) for update_args in update_args_list) + + +def _pair_audio_tracks_and_gain( + audio_components: Sequence[gr.Audio], + gain_components: Sequence[gr.Slider], + data: dict[gr.Audio | gr.Slider, Any], +) -> list[tuple[str, int]]: + """ + Pair audio tracks and gain levels stored in separate gradio + components. + + This function is meant to first be partially applied to the sequence + of audio components and the sequence of slider components containing + the values that should be combined. The resulting function can then + be called by an event listener whose inputs is a set containing + those audio and slider components. The `data` parameter in that case + will contain a mapping from each of those components to the value + that the component stores. + + Parameters + ---------- + audio_components : Sequence[gr.Audio] + Audio components to pair with gain levels. + gain_components : Sequence[gr.Slider] + Gain level components to pair with audio tracks. + data : dict[gr.Audio | gr.Slider, Any] + Data from the audio and gain components. + + Returns + ------- + list[tuple[str, int]] + Paired audio tracks and gain levels. + + Raises + ------ + ValueError + If the number of audio tracks and gain levels are not the same. + + """ + audio_tracks = [data[component] for component in audio_components] + gain_levels = [data[component] for component in gain_components] + if len(audio_tracks) != len(gain_levels): + err_msg = "Number of audio tracks and gain levels must be the same." + raise ValueError(err_msg) + return [ + (audio_track, gain_level) + for audio_track, gain_level in zip(audio_tracks, gain_levels, strict=True) + if audio_track + ] + + +def render( + song_dirs: Sequence[gr.Dropdown], + cached_song_1click: gr.Dropdown, + cached_song_multi: gr.Dropdown, + model_multi: gr.Dropdown, + intermediate_audio: gr.Dropdown, + output_audio: gr.Dropdown, +) -> None: + """ + Render "Multi-step generation" tab. + + Parameters + ---------- + song_dirs : Sequence[gr.Dropdown] + Dropdowns for selecting song directories in the + "Multi-step generation" tab. + cached_song_1click : gr.Dropdown + Dropdown for selecting a cached song in the + "One-click generation" tab. + cached_song_multi : gr.Dropdown + Dropdown for selecting a cached song in the + "Multi-step generation" tab. + model_multi : gr.Dropdown + Dropdown for selecting a voice model in the + "Multi-step generation" tab. + intermediate_audio : gr.Dropdown + Dropdown for selecting intermediate audio files to delete in the + "Delete audio" tab. 
+ output_audio : gr.Dropdown + Dropdown for selecting output audio files to delete in the + "Delete audio" tab. + + """ + with gr.Tab("Multi-step generation"): + ( + separate_audio_dir, + convert_vocals_dir, + postprocess_vocals_dir, + pitch_shift_background_dir, + mix_dir, + ) = song_dirs + current_song_dir = gr.State(None) + + input_tracks = [ + gr.Audio(label=label, type="filepath", render=False) + for label in [ + "Audio", + "Vocals", + "Vocals", + "Instrumentals", + "Backup vocals", + "Main vocals", + "Instrumentals", + "Backup vocals", + ] + ] + ( + audio_track_input, + vocals_track_input, + converted_vocals_track_input, + instrumentals_track_input, + backup_vocals_track_input, + main_vocals_track_input, + shifted_instrumentals_track_input, + shifted_backup_vocals_track_input, + ) = input_tracks + + ( + song_output, + primary_stem_output, + secondary_stem_output, + converted_vocals_track_output, + effected_vocals_track_output, + shifted_instrumentals_track_output, + shifted_backup_vocals_track_output, + song_cover_output, + ) = [ + gr.Audio(label=label, type="filepath", interactive=False, render=False) + for label in [ + "Song", + "Primary stem", + "Secondary stem", + "Converted vocals", + "Effected vocals", + "Pitch-shifted instrumentals", + "Pitch-shifted backup vocals", + "Song cover", + ] + ] + + transfer_defaults = [ + ["Step 1: audio"], + ["Step 2: vocals"], + ["Step 4: instrumentals"], + ["Step 3: vocals"], + ["Step 5: main vocals"], + ["Step 5: instrumentals"], + ["Step 5: backup vocals"], + [], + ] + + ( + song_transfer_default, + primary_stem_transfer_default, + secondary_stem_transfer_default, + converted_vocals_transfer_default, + effected_vocals_transfer_default, + shifted_instrumentals_transfer_default, + shifted_backup_vocals_transfer_default, + song_cover_transfer_default, + ) = transfer_defaults + + ( + song_transfer, + primary_stem_transfer, + secondary_stem_transfer, + converted_vocals_transfer, + effected_vocals_transfer, + shifted_instrumentals_transfer, + shifted_backup_vocals_transfer, + song_cover_transfer, + ) = [ + gr.Dropdown( + [ + "Step 1: audio", + "Step 2: vocals", + "Step 3: vocals", + "Step 4: instrumentals", + "Step 4: backup vocals", + "Step 5: main vocals", + "Step 5: instrumentals", + "Step 5: backup vocals", + ], + label=f"{label_prefix} destination", + info=( + "Select the input track(s) to transfer the" + f" {label_prefix.lower()} to when the 'Transfer" + f" {label_prefix.lower()}' button is clicked." + ), + render=False, + type="index", + multiselect=True, + value=value, + ) + for value, label_prefix in zip( + transfer_defaults, + [ + "Song", + "Primary stem", + "Secondary stem", + "Converted vocals", + "Effected vocals", + "Pitch-shifted instrumentals", + "Pitch-shifted backup vocals", + "Song cover", + ], + strict=True, + ) + ] + + with gr.Accordion("Step 0: song retrieval", open=True): + gr.Markdown("") + gr.Markdown("**Inputs**") + with gr.Row(): + with gr.Column(): + source_type = gr.Dropdown( + list(SourceType), + value=SourceType.PATH, + label="Source type", + type="index", + info="The type of source to retrieve a song from.", + ) + with gr.Column(): + source = gr.Textbox( + label="Source", + info=( + "Link to a song on YouTube or the full path of a local" + " audio file." 
+ ), + ) + local_file = gr.Audio( + label="Source", + type="filepath", + visible=False, + ) + cached_song_multi.render() + + source_type.input( + partial(toggle_visible_component, 3), + inputs=source_type, + outputs=[source, local_file, cached_song_multi], + show_progress="hidden", + ) + + local_file.change( + update_value, + inputs=local_file, + outputs=source, + show_progress="hidden", + ) + cached_song_multi.input( + update_value, + inputs=cached_song_multi, + outputs=source, + show_progress="hidden", + ) + gr.Markdown("**Settings**") + song_transfer.render() + gr.Markdown("**Outputs**") + song_output.render() + gr.Markdown("**Controls**") + retrieve_song_btn = gr.Button("Retrieve song", variant="primary") + song_transfer_btn = gr.Button("Transfer song") + retrieve_song_reset_btn = gr.Button("Reset settings") + + retrieve_song_reset_btn.click( + lambda: gr.Dropdown(value=song_transfer_default), + outputs=song_transfer, + show_progress="hidden", + ) + retrieve_song_btn.click( + partial( + exception_harness( + retrieve_song, + info_msg="Song retrieved successfully!", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=source, + outputs=[song_output, current_song_dir], + ).then( + partial( + update_cached_songs, + len(song_dirs) + 2, + value_indices=range(len(song_dirs)), + ), + inputs=current_song_dir, + outputs=([*song_dirs, cached_song_multi, cached_song_1click]), + show_progress="hidden", + ).then( + partial(update_cached_songs, 1, [], [0]), + outputs=intermediate_audio, + show_progress="hidden", + ) + + with gr.Accordion("Step 1: vocal separation", open=False): + gr.Markdown("") + gr.Markdown("**Inputs**") + audio_track_input.render() + separate_audio_dir.render() + gr.Markdown("**Settings**") + with gr.Row(): + separation_model = gr.Dropdown( + list(SeparationModel), + value=SeparationModel.UVR_MDX_NET_VOC_FT, + label="Separation model", + info="The model to use for audio separation.", + ) + segment_size = gr.Radio( + list(SegmentSize), + value=SegmentSize.SEG_512, + label="Segment size", + info=( + "Size of segments into which the audio is split. Larger" + " consumes more resources, but may give better results." 
+ ), + ) + with gr.Row(): + primary_stem_transfer.render() + secondary_stem_transfer.render() + + gr.Markdown("**Outputs**") + with gr.Row(): + primary_stem_output.render() + secondary_stem_output.render() + gr.Markdown("**Controls**") + separate_vocals_btn = gr.Button("Separate vocals", variant="primary") + with gr.Row(): + primary_stem_transfer_btn = gr.Button("Transfer primary stem") + secondary_stem_transfer_btn = gr.Button("Transfer secondary stem") + separate_audio_reset_btn = gr.Button("Reset settings") + + separate_audio_reset_btn.click( + lambda: [ + SeparationModel.UVR_MDX_NET_VOC_FT, + SegmentSize.SEG_512, + gr.Dropdown(value=primary_stem_transfer_default), + gr.Dropdown(value=secondary_stem_transfer_default), + ], + outputs=[ + separation_model, + segment_size, + primary_stem_transfer, + secondary_stem_transfer, + ], + show_progress="hidden", + ) + separate_vocals_btn.click( + partial( + exception_harness( + separate_audio, + info_msg="Vocals separated successfully!", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=[ + audio_track_input, + separate_audio_dir, + separation_model, + segment_size, + ], + outputs=[primary_stem_output, secondary_stem_output], + concurrency_limit=1, + concurrency_id=ConcurrencyId.GPU, + ) + with gr.Accordion("Step 2: vocal conversion", open=False): + gr.Markdown("") + gr.Markdown("**Inputs**") + vocals_track_input.render() + with gr.Row(): + convert_vocals_dir.render() + if model_multi.info: + model_multi.info += "
" + model_multi.render() + gr.Markdown("**Settings**") + with gr.Row(): + n_octaves = gr.Slider( + -3, + 3, + value=0, + step=1, + label="Pitch shift (octaves)", + info=( + "The number of octaves to pitch-shift the converted vocals by." + " Use 1 for male-to-female and -1 for vice-versa." + ), + ) + n_semitones = gr.Slider( + -12, + 12, + value=0, + step=1, + label="Pitch shift (semi-tones)", + info=( + "The number of semi-tones to pitch-shift the converted vocals" + " by. Altering this slightly reduces sound quality." + ), + ) + with gr.Row(): + index_rate = gr.Slider( + 0, + 1, + value=0.5, + label="Index rate", + info=( + "How much of the accent in the voice model to keep in the" + " converted vocals. Increase to bias the conversion towards the" + " accent of the voice model." + ), + ) + filter_radius = gr.Slider( + 0, + 7, + value=3, + step=1, + label="Filter radius", + info=( + "If >=3: apply median filtering to harvested pitch results." + " Can help reduce breathiness in the converted vocals." + ), + ) + rms_mix_rate = gr.Slider( + 0, + 1, + value=0.25, + label="RMS mix rate", + info=( + "How much to mimic the loudness (0) of the input vocals or a" + " fixed loudness (1)." + "
" + ), + ) + with gr.Row(): + protect = gr.Slider( + 0, + 0.5, + value=0.33, + label="Protect rate", + info=( + "Protection of voiceless consonants and breath sounds. Decrease" + " to increase protection at the cost of indexing accuracy. Set" + " to 0.5 to disable." + "
" + ), + ) + f0_method = gr.Dropdown( + list(F0Method), + value=F0Method.RMVPE, + label="Pitch detection algorithm", + info=( + "The method to use for pitch detection. Best option is" + " RMVPE (clarity in vocals), then Mangio-CREPE (smoother" + " vocals)." + "
" + ), + ) + hop_length = gr.Slider( + 32, + 320, + value=128, + step=1, + label="Hop length", + info=( + "How often the CREPE-based pitch detection algorithm checks" + " for pitch changes. Measured in milliseconds. Lower values" + " lead to longer conversion times and a higher risk of" + " voice cracks, but better pitch accuracy." + ), + ) + + converted_vocals_transfer.render() + gr.Markdown("**Outputs**") + converted_vocals_track_output.render() + gr.Markdown("**Controls**") + convert_vocals_btn = gr.Button("Convert vocals", variant="primary") + converted_vocals_transfer_btn = gr.Button("Transfer converted vocals") + convert_vocals_reset_btn = gr.Button("Reset settings") + + convert_vocals_reset_btn.click( + lambda: [ + 0, + 0, + 0.5, + 3, + 0.25, + 0.33, + F0Method.RMVPE, + 128, + gr.Dropdown(value=converted_vocals_transfer_default), + ], + outputs=[ + n_octaves, + n_semitones, + index_rate, + filter_radius, + rms_mix_rate, + protect, + f0_method, + hop_length, + converted_vocals_transfer, + ], + show_progress="hidden", + ) + convert_vocals_btn.click( + partial( + exception_harness( + convert, + info_msg="Vocals converted successfully!", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=[ + vocals_track_input, + convert_vocals_dir, + model_multi, + n_octaves, + n_semitones, + f0_method, + index_rate, + filter_radius, + rms_mix_rate, + protect, + hop_length, + ], + outputs=converted_vocals_track_output, + concurrency_id=ConcurrencyId.GPU, + concurrency_limit=1, + ) + with gr.Accordion("Step 3: vocal post-processing", open=False): + gr.Markdown("") + gr.Markdown("**Inputs**") + converted_vocals_track_input.render() + postprocess_vocals_dir.render() + gr.Markdown("**Settings**") + with gr.Row(): + room_size = gr.Slider( + 0, + 1, + value=0.15, + label="Room size", + info=( + "Size of the room which reverb effect simulates. Increase for" + " longer reverb time." 
+ ), + ) + with gr.Row(): + wet_level = gr.Slider( + 0, + 1, + value=0.2, + label="Wetness level", + info="Loudness of converted vocals with reverb effect applied.", + ) + dry_level = gr.Slider( + 0, + 1, + value=0.8, + label="Dryness level", + info="Loudness of converted vocals without reverb effect applied.", + ) + damping = gr.Slider( + 0, + 1, + value=0.7, + label="Damping level", + info="Absorption of high frequencies in reverb effect.", + ) + + effected_vocals_transfer.render() + gr.Markdown("**Outputs**") + + effected_vocals_track_output.render() + gr.Markdown("**Controls**") + postprocess_vocals_btn = gr.Button( + "Post-process vocals", + variant="primary", + ) + effected_vocals_transfer_btn = gr.Button("Transfer effected vocals") + postprocess_vocals_reset_btn = gr.Button("Reset settings") + + postprocess_vocals_reset_btn.click( + lambda: [ + 0.15, + 0.2, + 0.8, + 0.7, + gr.Dropdown(value=effected_vocals_transfer_default), + ], + outputs=[ + room_size, + wet_level, + dry_level, + damping, + effected_vocals_transfer, + ], + show_progress="hidden", + ) + postprocess_vocals_btn.click( + partial( + exception_harness( + postprocess, + info_msg="Vocals post-processed successfully!", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=[ + converted_vocals_track_input, + postprocess_vocals_dir, + room_size, + wet_level, + dry_level, + damping, + ], + outputs=effected_vocals_track_output, + ) + with gr.Accordion("Step 4: pitch shift of background audio", open=False): + gr.Markdown("") + gr.Markdown("**Inputs**") + with gr.Row(): + instrumentals_track_input.render() + backup_vocals_track_input.render() + pitch_shift_background_dir.render() + gr.Markdown("**Settings**") + with gr.Row(): + n_semitones_instrumentals = gr.Slider( + -12, + 12, + value=0, + step=1, + label="Instrumental pitch shift", + info="The number of semi-tones to pitch-shift the instrumentals by", + ) + n_semitones_backup_vocals = gr.Slider( + -12, + 12, + value=0, + step=1, + label="Backup vocal pitch shift", + info="The number of semi-tones to pitch-shift the backup vocals by", + ) + with gr.Row(): + if shifted_instrumentals_transfer.info: + shifted_instrumentals_transfer.info += "
" + shifted_instrumentals_transfer.render() + shifted_backup_vocals_transfer.render() + + gr.Markdown("**Outputs**") + with gr.Row(): + shifted_instrumentals_track_output.render() + shifted_backup_vocals_track_output.render() + gr.Markdown("**Controls**") + with gr.Row(): + pitch_shift_instrumentals_btn = gr.Button( + "Pitch shift instrumentals", + variant="primary", + ) + pitch_shift_backup_vocals_btn = gr.Button( + "Pitch shift backup vocals", + variant="primary", + ) + with gr.Row(): + shifted_instrumentals_transfer_btn = gr.Button( + "Transfer pitch-shifted instrumentals", + ) + shifted_backup_vocals_transfer_btn = gr.Button( + "Transfer pitch-shifted backup vocals", + ) + pitch_shift_background_reset_btn = gr.Button("Reset settings") + + pitch_shift_background_reset_btn.click( + lambda: [ + 0, + 0, + gr.Dropdown(value=shifted_instrumentals_transfer_default), + gr.Dropdown(value=shifted_backup_vocals_transfer_default), + ], + outputs=[ + n_semitones_instrumentals, + n_semitones_backup_vocals, + shifted_instrumentals_transfer, + shifted_backup_vocals_transfer, + ], + show_progress="hidden", + ) + pitch_shift_instrumentals_btn.click( + partial( + exception_harness( + pitch_shift, + info_msg="Instrumentals pitch-shifted successfully!", + ), + display_msg="Pitch shifting instrumentals...", + progress_bar=PROGRESS_BAR, + ), + inputs=[ + instrumentals_track_input, + pitch_shift_background_dir, + n_semitones_instrumentals, + ], + outputs=shifted_instrumentals_track_output, + ) + pitch_shift_backup_vocals_btn.click( + partial( + exception_harness( + pitch_shift, + info_msg="Backup vocals pitch-shifted successfully!", + ), + display_msg="Pitch shifting backup vocals...", + progress_bar=PROGRESS_BAR, + ), + inputs=[ + backup_vocals_track_input, + pitch_shift_background_dir, + n_semitones_backup_vocals, + ], + outputs=shifted_backup_vocals_track_output, + ) + with gr.Accordion("Step 5: song mixing", open=False): + gr.Markdown("") + gr.Markdown("**Inputs**") + with gr.Row(): + main_vocals_track_input.render() + shifted_instrumentals_track_input.render() + shifted_backup_vocals_track_input.render() + mix_dir.render() + gr.Markdown("**Settings**") + with gr.Row(): + main_gain = gr.Slider( + -20, + 20, + value=0, + step=1, + label="Main gain", + info="The gain to apply to the main vocals.", + ) + inst_gain = gr.Slider( + -20, + 20, + value=0, + step=1, + label="Instrumentals gain", + info="The gain to apply to the instrumentals.", + ) + backup_gain = gr.Slider( + -20, + 20, + value=0, + step=1, + label="Backup gain", + info="The gain to apply to the backup vocals.", + ) + with gr.Row(): + output_name = gr.Textbox( + value=update_song_cover_name, + inputs=[main_vocals_track_input, mix_dir], + label="Output name", + placeholder="Ultimate RVC song cover", + info=( + "If no name is provided, a suitable name will be generated" + " automatically." 
+ ), + ) + output_sr = gr.Dropdown( + choices=list(SampleRate), + value=SampleRate.HZ_44100, + label="Output sample rate", + info="The sample rate to save the generated song in.", + ) + output_format = gr.Dropdown( + list(AudioExt), + value=AudioExt.MP3, + label="Output format", + info="The format to save the generated song in.", + ) + song_cover_transfer.render() + gr.Markdown("**Outputs**") + song_cover_output.render() + gr.Markdown("**Controls**") + mix_btn = gr.Button("Mix song cover", variant="primary") + song_cover_transfer_btn = gr.Button("Transfer song cover") + mix_reset_btn = gr.Button("Reset settings") + + mix_reset_btn.click( + lambda: [ + 0, + 0, + 0, + SampleRate.HZ_44100, + AudioExt.MP3, + gr.Dropdown(value=song_cover_transfer_default), + ], + outputs=[ + main_gain, + inst_gain, + backup_gain, + output_sr, + output_format, + song_cover_transfer, + ], + show_progress="hidden", + ) + temp_audio_gains = gr.State() + mix_btn.click( + partial( + _pair_audio_tracks_and_gain, + [ + main_vocals_track_input, + shifted_instrumentals_track_input, + shifted_backup_vocals_track_input, + ], + [main_gain, inst_gain, backup_gain], + ), + inputs={ + main_vocals_track_input, + shifted_instrumentals_track_input, + shifted_backup_vocals_track_input, + main_gain, + inst_gain, + backup_gain, + }, + outputs=temp_audio_gains, + ).then( + partial( + exception_harness( + mix_song, + info_msg="Song cover succesfully generated.", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=[ + temp_audio_gains, + mix_dir, + output_sr, + output_format, + output_name, + ], + outputs=song_cover_output, + ).then( + partial(update_output_audio, 1, [], [0]), + outputs=output_audio, + show_progress="hidden", + ) + + for btn, transfer, output in [ + (song_transfer_btn, song_transfer, song_output), + (primary_stem_transfer_btn, primary_stem_transfer, primary_stem_output), + ( + secondary_stem_transfer_btn, + secondary_stem_transfer, + secondary_stem_output, + ), + ( + converted_vocals_transfer_btn, + converted_vocals_transfer, + converted_vocals_track_output, + ), + ( + effected_vocals_transfer_btn, + effected_vocals_transfer, + effected_vocals_track_output, + ), + ( + shifted_instrumentals_transfer_btn, + shifted_instrumentals_transfer, + shifted_instrumentals_track_output, + ), + ( + shifted_backup_vocals_transfer_btn, + shifted_backup_vocals_transfer, + shifted_backup_vocals_track_output, + ), + (song_cover_transfer_btn, song_cover_transfer, song_cover_output), + ]: + btn.click( + partial(_update_audio, len(input_tracks)), + inputs=[transfer, output], + outputs=input_tracks, + show_progress="hidden", + ) diff --git a/src/ultimate_rvc/web/tabs/one_click_generation.py b/src/ultimate_rvc/web/tabs/one_click_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf73aee20e264375d438b27d796c0a6d86bf751 --- /dev/null +++ b/src/ultimate_rvc/web/tabs/one_click_generation.py @@ -0,0 +1,487 @@ +"""Module which defines the code for the "One-click generation" tab.""" + +from collections.abc import Sequence +from functools import partial + +import gradio as gr + +from ultimate_rvc.core.generate.song_cover import run_pipeline +from ultimate_rvc.typing_extra import AudioExt, F0Method, SampleRate +from ultimate_rvc.web.common import ( + PROGRESS_BAR, + exception_harness, + toggle_visible_component, + update_cached_songs, + update_output_audio, + update_song_cover_name, + update_value, +) +from ultimate_rvc.web.typing_extra import ConcurrencyId, SourceType + + +def _toggle_intermediate_audio( + visible: bool, 
+) -> list[gr.Accordion]: + """ + Toggle the visibility of intermediate audio accordions. + + Parameters + ---------- + visible : bool + Visibility status of the intermediate audio accordions. + + Returns + ------- + list[gr.Accordion] + The intermediate audio accordions. + + """ + accordions = [gr.Accordion(open=False) for _ in range(7)] + return [gr.Accordion(visible=visible, open=False), *accordions] + + +def render( + song_dirs: Sequence[gr.Dropdown], + cached_song_1click: gr.Dropdown, + cached_song_multi: gr.Dropdown, + model_1click: gr.Dropdown, + intermediate_audio: gr.Dropdown, + output_audio: gr.Dropdown, +) -> None: + """ + Render "One-click generation" tab. + + Parameters + ---------- + song_dirs : Sequence[gr.Dropdown] + Dropdowns for selecting song directories in the + "Multi-step generation" tab. + cached_song_1click : gr.Dropdown + Dropdown for selecting a cached song in the + "One-click generation" tab + cached_song_multi : gr.Dropdown + Dropdown for selecting a cached song in the + "Multi-step generation" tab + model_1click : gr.Dropdown + Dropdown for selecting voice model in the + "One-click generation" tab. + intermediate_audio : gr.Dropdown + Dropdown for selecting intermediate audio files to delete in the + "Delete audio" tab. + output_audio : gr.Dropdown + Dropdown for selecting output audio files to delete in the + "Delete audio" tab. + + """ + with gr.Tab("One-click generation"): + with gr.Accordion("Main options"): + with gr.Row(): + with gr.Column(): + source_type = gr.Dropdown( + list(SourceType), + value=SourceType.PATH, + label="Source type", + type="index", + info="The type of source to retrieve a song from.", + ) + with gr.Column(): + source = gr.Textbox( + label="Source", + info=( + "Link to a song on YouTube or the full path of a local" + " audio file." + ), + ) + local_file = gr.Audio( + label="Source", + type="filepath", + visible=False, + ) + cached_song_1click.render() + source_type.input( + partial(toggle_visible_component, 3), + inputs=source_type, + outputs=[source, local_file, cached_song_1click], + show_progress="hidden", + ) + + local_file.change( + update_value, + inputs=local_file, + outputs=source, + show_progress="hidden", + ) + cached_song_1click.input( + update_value, + inputs=cached_song_1click, + outputs=source, + show_progress="hidden", + ) + with gr.Row(): + model_1click.render() + n_octaves = gr.Slider( + -3, + 3, + value=0, + step=1, + label="Vocal pitch shift", + info=( + "The number of octaves to pitch-shift converted vocals by." + " Use 1 for male-to-female and -1 for vice-versa." + ), + ) + n_semitones = gr.Slider( + -12, + 12, + value=0, + step=1, + label="Overall pitch shift", + info=( + "The number of semi-tones to pitch-shift converted vocals," + " instrumentals, and backup vocals by." + ), + ) + + with gr.Accordion("Vocal conversion options", open=False): + with gr.Row(): + index_rate = gr.Slider( + 0, + 1, + value=0.5, + label="Index rate", + info=( + "How much of the accent in the voice model to keep in the" + " converted vocals. Increase to bias the conversion towards the" + " accent of the voice model." + ), + ) + filter_radius = gr.Slider( + 0, + 7, + value=3, + step=1, + label="Filter radius", + info=( + "If >=3: apply median filtering to harvested pitch results." + " Can help reduce breathiness in the converted vocals." + ), + ) + rms_mix_rate = gr.Slider( + 0, + 1, + value=0.25, + label="RMS mix rate", + info=( + "How much to mimic the loudness (0) of the input vocals or a" + " fixed loudness (1)." + "
" + ), + ) + with gr.Row(): + protect = gr.Slider( + 0, + 0.5, + value=0.33, + label="Protect rate", + info=( + "Protection of voiceless consonants and breath sounds. Decrease" + " to increase protection at the cost of indexing accuracy. Set" + " to 0.5 to disable." + "

" + ), + ) + f0_method = gr.Dropdown( + list(F0Method), + value=F0Method.RMVPE, + label="Pitch detection algorithm", + info=( + "The method to use for pitch detection. Best option is RMVPE" + " (clarity in vocals), then Mangio-CREPE (smoother vocals)." + "
" + ), + ) + hop_length = gr.Slider( + 32, + 320, + value=128, + step=1, + label="Hop length", + info=( + "How often the CREPE-based pitch detection algorithm checks for" + " pitch changes. Measured in milliseconds. Lower values lead to" + " longer conversion times and a higher risk of voice cracks," + " but better pitch accuracy." + ), + ) + with gr.Accordion("Audio mixing options", open=False): + gr.Markdown("") + gr.Markdown("**Reverb control on converted vocals**") + with gr.Row(): + room_size = gr.Slider( + 0, + 1, + value=0.15, + label="Room size", + info=( + "Size of the room which reverb effect simulates. Increase for" + " longer reverb time." + ), + ) + with gr.Row(): + wet_level = gr.Slider( + 0, + 1, + value=0.2, + label="Wetness level", + info="Loudness of converted vocals with reverb effect applied.", + ) + dry_level = gr.Slider( + 0, + 1, + value=0.8, + label="Dryness level", + info="Loudness of converted vocals without reverb effect applied.", + ) + damping = gr.Slider( + 0, + 1, + value=0.7, + label="Damping level", + info="Absorption of high frequencies in reverb effect.", + ) + + gr.Markdown("") + gr.Markdown("**Volume controls (dB)**") + with gr.Row(): + main_gain = gr.Slider(-20, 20, value=0, step=1, label="Main vocals") + inst_gain = gr.Slider(-20, 20, value=0, step=1, label="Instrumentals") + backup_gain = gr.Slider(-20, 20, value=0, step=1, label="Backup vocals") + with gr.Accordion("Audio output options", open=False): + with gr.Row(): + output_name = gr.Textbox( + value=partial( + update_song_cover_name, + None, + update_placeholder=True, + ), + inputs=[cached_song_1click, model_1click], + label="Output name", + info=( + "If no name is provided, a suitable name will be generated" + " automatically." + ), + placeholder="Ultimate RVC song cover", + ) + output_sr = gr.Dropdown( + choices=list(SampleRate), + value=SampleRate.HZ_44100, + label="Output sample rate", + info="The sample rate to save the generated song cover in.", + ) + output_format = gr.Dropdown( + list(AudioExt), + value=AudioExt.MP3, + label="Output format", + info="The format to save the generated song cover in.", + ) + with gr.Row(): + show_intermediate_audio = gr.Checkbox( + label="Show intermediate audio", + value=False, + info=( + "Show intermediate audio tracks generated during song cover" + " generation." 
+ ), + ) + + intermediate_audio_accordions = [ + gr.Accordion(label, open=False, render=False) + for label in [ + "Step 0: song retrieval", + "Step 1a: vocals/instrumentals separation", + "Step 1b: main vocals/ backup vocals separation", + "Step 1c: main vocals cleanup", + "Step 2: conversion of main vocals", + "Step 3: post-processing of converted vocals", + "Step 4: pitch shift of background tracks", + ] + ] + ( + song_retrieval_accordion, + vocals_separation_accordion, + main_vocals_separation_accordion, + vocal_cleanup_accordion, + vocal_conversion_accordion, + vocals_postprocessing_accordion, + pitch_shift_accordion, + ) = intermediate_audio_accordions + intermediate_audio_tracks = [ + gr.Audio(label=label, type="filepath", interactive=False, render=False) + for label in [ + "Song", + "Vocals", + "Instrumentals", + "Main vocals", + "Backup vocals", + "De-reverbed main vocals", + "Main vocals reverb", + "Converted vocals", + "Post-processed vocals", + "Pitch-shifted instrumentals", + "Pitch-shifted backup vocals", + ] + ] + ( + song, + vocals_track, + instrumentals_track, + main_vocals_track, + backup_vocals_track, + main_vocals_dereverbed_track, + main_vocals_reverb_track, + converted_vocals_track, + postprocessed_vocals_track, + instrumentals_shifted_track, + backup_vocals_shifted_track, + ) = intermediate_audio_tracks + with gr.Accordion( + "Intermediate audio tracks", + open=False, + visible=False, + ) as intermediate_audio_accordion: + song_retrieval_accordion.render() + with song_retrieval_accordion: + song.render() + vocals_separation_accordion.render() + with vocals_separation_accordion, gr.Row(): + vocals_track.render() + instrumentals_track.render() + main_vocals_separation_accordion.render() + with main_vocals_separation_accordion, gr.Row(): + main_vocals_track.render() + backup_vocals_track.render() + vocal_cleanup_accordion.render() + with vocal_cleanup_accordion, gr.Row(): + main_vocals_dereverbed_track.render() + main_vocals_reverb_track.render() + vocal_conversion_accordion.render() + with vocal_conversion_accordion: + converted_vocals_track.render() + vocals_postprocessing_accordion.render() + with vocals_postprocessing_accordion: + postprocessed_vocals_track.render() + pitch_shift_accordion.render() + with pitch_shift_accordion, gr.Row(): + instrumentals_shifted_track.render() + backup_vocals_shifted_track.render() + + show_intermediate_audio.change( + _toggle_intermediate_audio, + inputs=show_intermediate_audio, + outputs=[ + intermediate_audio_accordion, + *intermediate_audio_accordions, + ], + show_progress="hidden", + ) + + with gr.Row(equal_height=True): + reset_btn = gr.Button(value="Reset settings", scale=2) + generate_btn = gr.Button("Generate", scale=2, variant="primary") + song_cover = gr.Audio(label="Song cover", scale=3) + + generate_btn.click( + partial( + exception_harness( + run_pipeline, + info_msg="Song cover generated successfully!", + ), + progress_bar=PROGRESS_BAR, + ), + inputs=[ + source, + model_1click, + n_octaves, + n_semitones, + f0_method, + index_rate, + filter_radius, + rms_mix_rate, + protect, + hop_length, + room_size, + wet_level, + dry_level, + damping, + main_gain, + inst_gain, + backup_gain, + output_sr, + output_format, + output_name, + ], + outputs=[song_cover, *intermediate_audio_tracks], + concurrency_limit=1, + concurrency_id=ConcurrencyId.GPU, + ).success( + partial( + update_cached_songs, + 3 + len(song_dirs), + [], + [2], + ), + outputs=[ + cached_song_1click, + cached_song_multi, + intermediate_audio, + *song_dirs, + ], + 
show_progress="hidden", + ).then( + partial(update_output_audio, 1, [], [0]), + outputs=[output_audio], + show_progress="hidden", + ) + reset_btn.click( + lambda: [ + 0, + 0, + 0.5, + 3, + 0.25, + 0.33, + F0Method.RMVPE, + 128, + 0.15, + 0.2, + 0.8, + 0.7, + 0, + 0, + 0, + SampleRate.HZ_44100, + AudioExt.MP3, + False, + ], + outputs=[ + n_octaves, + n_semitones, + index_rate, + filter_radius, + rms_mix_rate, + protect, + f0_method, + hop_length, + room_size, + wet_level, + dry_level, + damping, + main_gain, + inst_gain, + backup_gain, + output_sr, + output_format, + show_intermediate_audio, + ], + show_progress="hidden", + ) diff --git a/src/ultimate_rvc/web/tabs/other_settings.py b/src/ultimate_rvc/web/tabs/other_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..e2d7fa47273444862381fbe5b77bffd21c83109d --- /dev/null +++ b/src/ultimate_rvc/web/tabs/other_settings.py @@ -0,0 +1,46 @@ +"""Module which defines the code for the "Other settings" tab.""" + +from functools import partial + +import gradio as gr + +from ultimate_rvc.core.manage.other_settings import delete_temp_files +from ultimate_rvc.web.common import ( + PROGRESS_BAR, + confirm_box_js, + confirmation_harness, + render_msg, +) + + +def render() -> None: + """Render "Other settings" tab.""" + dummy_checkbox = gr.Checkbox(visible=False) + + gr.Markdown("") + with gr.Accordion("Temporary files", open=True): + gr.Markdown("") + with gr.Row(equal_height=True): + temporary_files_btn = gr.Button("Delete all", variant="primary") + temporary_files_msg = gr.Textbox(label="Output message", interactive=False) + + temporary_files_btn.click( + partial( + confirmation_harness(delete_temp_files), + progress_bar=PROGRESS_BAR, + ), + inputs=dummy_checkbox, + outputs=temporary_files_msg, + js=confirm_box_js( + "Are you sure you want to delete all temporary files? Any files uploaded" + " directly via the UI will not be available for further processing until" + " they are re-uploaded.", + ), + ).success( + partial( + render_msg, + "[-] Successfully deleted all temporary files!", + ), + outputs=temporary_files_msg, + show_progress="hidden", + ) diff --git a/src/ultimate_rvc/web/typing_extra.py b/src/ultimate_rvc/web/typing_extra.py new file mode 100644 index 0000000000000000000000000000000000000000..dfeebf1d5aa6841324f04c5f53786d8a200b1fd6 --- /dev/null +++ b/src/ultimate_rvc/web/typing_extra.py @@ -0,0 +1,96 @@ +""" +Module which defines extra types for the web application of the Ultimate +RVC project. +""" + +from typing import Any, TypedDict + +from collections.abc import Callable, Sequence +from enum import StrEnum, auto + +type DropdownChoices = ( + Sequence[str | int | float | tuple[str, str | int | float]] | None +) + +type DropdownValue = ( + str | int | float | Sequence[str | int | float] | Callable[..., Any] | None +) + + +class ConcurrencyId(StrEnum): + """Enumeration of possible concurrency identifiers.""" + + GPU = auto() + + +class SourceType(StrEnum): + """The type of source providing the song to generate a cover of.""" + + PATH = "YouTube link/local path" + LOCAL_FILE = "Local file" + CACHED_SONG = "Cached song" + + +class ComponentVisibilityKwArgs(TypedDict): + """ + Keyword arguments for setting component visibility. + + Attributes + ---------- + visible : bool + Whether the component should be visible. + value : Any + The value of the component. + + """ + + visible: bool + value: Any + + +class UpdateDropdownKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating a dropdown component. 
+ + Attributes + ---------- + choices : DropdownChoices + The updated choices for the dropdown component. + value : DropdownValue + The updated value for the dropdown component. + + """ + + choices: DropdownChoices + value: DropdownValue + + +class TextBoxKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating a textbox component. + + Attributes + ---------- + value : str | None + The updated value for the textbox component. + placeholder : str | None + The updated placeholder for the textbox component. + + """ + + value: str | None + placeholder: str | None + + +class UpdateAudioKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating an audio component. + + Attributes + ---------- + value : str | None + The updated value for the audio component. + + """ + + value: str | None diff --git a/urvc b/urvc new file mode 100644 index 0000000000000000000000000000000000000000..dbad58bd752b97e084eee1292362f6c126c514a8 --- /dev/null +++ b/urvc @@ -0,0 +1,186 @@ +#!/bin/bash +# shellcheck shell=bash +# +# Licensed under the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Launcher script for Ultimate RVC on Debian-based linux systems. +# Currently only supports Ubuntu 22.04 and Ubuntu 24.04. + +UV_PATH=$(pwd)/uv +VENV_PATH=$UV_PATH/.venv + +export UV_UNMANAGED_INSTALL=$UV_PATH +export UV_PYTHON_INSTALL_DIR="$UV_PATH/python" +export UV_PYTHON_BIN_DIR="$UV_PATH/python/bin" +export VIRTUAL_ENV=$VENV_PATH +export UV_PROJECT_ENVIRONMENT=$VENV_PATH +export UV_TOOL_DIR="$UV_PATH/tools" +export UV_TOOL_BIN_DIR="$UV_PATH/tools/bin" +export GRADIO_NODE_PATH="$VENV_PATH/lib/python3.12/site-packages/nodejs_wheel/bin/node" +export PATH="$UV_PATH:$PATH" + +main() { + command=$1 + shift + case $command in + install) + sudo apt install -y python3-dev unzip + install_distro_specifics + install_cuda_124 + curl -LsSf https://astral.sh/uv/0.5.0/install.sh | sh + uv run ./src/ultimate_rvc/core/main.py + ;; + update) + git pull + ;; + uninstall) + confirmation_msg=$( + cat <<- EOF + Are you sure you want to uninstall? + This will delete all dependencies and user-generated data [Y/n]: + EOF + ) + read -r -p "$confirmation_msg" confirmation + if [[ "$confirmation" =~ ^([Yy]|)$ ]]; then + git clean -dfX + echo "Uninstallation complete." + else + echo "Uninstallation canceled." + fi + ;; + run) + check_dependencies + uv run ./src/ultimate_rvc/web/main.py "$@" + ;; + dev) + check_dependencies + uv run gradio ./src/ultimate_rvc/web/main.py --demo-name app + ;; + cli) + check_dependencies + uv run ./src/ultimate_rvc/cli/main.py "$@" + ;; + docs) + check_dependencies + if [ "$#" -ne 2 ]; then + echo "The 'docs' command requires two arguments." + exit 1 + fi + uv run python -m typer "$1" utils docs --output "$2" + ;; + uv) + check_dependencies + uv "$@" + ;; + help) + show_help + ;; + *) + cat <<- EOF + Invalid command. + Use './urvc help' to see available commands. + EOF + exit 1 + ;; + esac +} + + +install_distro_specifics() { + # shellcheck disable=SC1091 + . /etc/lsb-release + case $DISTRIB_ID in + Ubuntu) + case $DISTRIB_RELEASE in + 24.04) + # Add Ubuntu 23.10 repository to sources.list so that we can install cuda 12.1 toolkit + + # first define the text to append to the file. 
+ # For this we use a heredoc with removal of leading tabs + TEXT=$( + cat <<- EOF + + ## Added by Ultimate RVC installer + Types: deb + URIs: http://archive.ubuntu.com/ubuntu/ + Suites: lunar + Components: universe + Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg + EOF + ) + FILE=/etc/apt/sources.list.d/ubuntu.sources + # Append to file if not already present + grep -qxF "## Added by Ultimate RVC installer" $FILE || echo "$TEXT" | sudo tee -a $FILE + sudo apt update + ;; + 22.04) + sudo apt install clang -y + ;; + *) + echo "Unsupported Ubuntu version" + exit 1 + ;; + esac + ;; + *) + echo "Unsupported debian distribution" + exit 1 + ;; + esac +} + +install_cuda_124() { + echo "Installing CUDA 12.4" + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt update + sudo apt -y install cuda-toolkit-12-4 + rm -rf cuda-keyring_1.1-1_all.deb + echo "CUDA 12.4 has been installed successfully" +} + +check_dependencies() { + if [ ! -d "$UV_PATH" ]; then + echo "Dependencies not found. Please run './urvc install' first." + exit 1 + fi +} + +show_help() { + cat <<- EOF + + Usage: ./urvc.sh [OPTIONS] COMMAND [ARGS]... + + Commands: + install Install dependencies and set up environment. + update Update Ultimate RVC to the latest version. + uninstall Uninstall dependencies and user generated data. + run Start Ultimate RVC. + options: + --help Show help message and exit. + [more information available, use --help to see all] + dev Start Ultimate RVC in development mode. + cli Start Ultimate RVC in CLI mode. + options: + --help Show help message and exit. + [more information available, use --help to see all] + docs Generate documentation using Typer. + arguments: + 0 The module to generate documentation for. + 1 The output directory. + uv Run an arbitary command using uv. + arguments: + 0 The command to run. + [more information available, use --help to see all] + options: + --help Show help message and exit. + [more information available, use --help to see all] + help Show this message and exit. + + EOF +} + +main "$@" diff --git a/urvc.ps1 b/urvc.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..6c39dde93505710b1371eaab8d75337580395a96 --- /dev/null +++ b/urvc.ps1 @@ -0,0 +1,143 @@ +# Licensed under the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +<# +.SYNOPSIS + +The launcher for Ultimate RVC. + +.DESCRIPTION + +This script is the entry point for Ultimate RVC. It is responsible for installing dependencies, +updating the application, running the application, and providing a CLI. + +.PARAMETER Command +The command to run. The available commands are: + +install: Install dependencies and set up environment. +update: Update Ultimate RVC to the latest version. +uninstall: Uninstall dependencies and user generated data. +run: Start Ultimate RVC. +dev: Start Ultimate RVC in development mode. +cli: Start Ultimate RVC in CLI mode. +docs: Generate documentation using Typer. +uv: Run an arbitrary command using uv. +help: Print help. + +.PARAMETER Arguments +The arguments and options to run the command with. +These are only used for the 'run', 'cli', 'docs' and 'uv' commands. + +run: + options: + --help: Print help. + [more information available, use --help to see all] +cli: + options: + --help: Print help. 
+ [more information available, use --help to see all] +docs: + arguments: + 0: The module to generate documentation for. + 1: The output directory for the documentation. +uv: + arguments: + 0: The command to run. + [more information available, use --help to see all] + options: + --help: Print help. + [more information available, use --help to see all] + +#> + +param ( + [Parameter(Position = 0, HelpMessage="The command to run.")] + [string]$Command, + [Parameter(ValueFromRemainingArguments = $true, ` + HelpMessage="The arguments to pass to the command.")] + [string[]]$Arguments +) + +$UvPath = "$(Get-location)\uv" +$env:UV_UNMANAGED_INSTALL = $UvPath +$env:UV_PYTHON_INSTALL_DIR = "$UvPath\python" +$env:UV_PYTHON_BIN_DIR = "$UV_PATH\python\bin" +$env:VIRTUAL_ENV = "$UvPath\.venv" +$env:UV_PROJECT_ENVIRONMENT = "$UvPath\.venv" +$env:UV_TOOL_DIR = "$UvPath\tools" +$env:UV_TOOL_BIN_DIR = "$UvPath\tools\bin" +$env:PATH = "$UvPath;$env:PATH" + +function Main { + param ( + [string]$Command, + [string[]]$Arguments + ) + + switch ($Command) { + "install" { + Invoke-RestMethod https://astral.sh/uv/0.5.0/install.ps1 | Invoke-Expression + uv run ./src/ultimate_rvc/core/main.py + } + "update" { + git pull + } + "uninstall" { + $confirmation_msg = "Are you sure you want to uninstall?`n" ` + + "This will delete all dependencies and user generated data [Y/n]" + $confirmation = Read-Host -Prompt $confirmation_msg + if ($confirmation -in @("", "Y", "y")) { + git clean -dfX + Write-Host "Uninstallation complete." + } else { + Write-Host "Uninstallation canceled." + } + + } + "run" { + Assert-Dependencies + uv run ./src/ultimate_rvc/web/main.py @Arguments + } + "dev" { + Assert-Dependencies + uv run gradio ./src/ultimate_rvc/web/main.py --demo-name app + } + "cli" { + Assert-Dependencies + uv run ./src/ultimate_rvc/cli/main.py @Arguments + } + "docs" { + Assert-Dependencies + if ($Arguments.Length -lt 2) { + Write-Host "The 'docs' command requires at least two arguments." + Exit 1 + } + uv run python -m typer $Arguments[0] utils docs --output $Arguments[1] + } + "uv" { + Assert-Dependencies + uv @Arguments + } + "help" { + Get-Help $PSCommandPath -Detailed + } + default { + $error_msg = "Invalid command.`n" ` + + "To see a list of valid commands, use the 'help' command." + Write-Host $error_msg + Exit 1 + } + } +} + +function Assert-Dependencies { + + if (-Not (Test-Path -Path $UvPath)) { + Write-Host "Dependencies not found. Please run './urvc install' first." + Exit 1 + } +} + +Main $Command $Arguments \ No newline at end of file
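
The standalone sketch below (not part of the diff above) illustrates the transfer-button pattern used in the "Multi-step generation" tab: a multiselect dropdown with type="index" selects destination Audio components, and the click handler returns one gr.Audio(...) update per candidate component, setting value only for the selected indices, mirroring the _update_audio helper. Names such as route_track, picker, and demo are illustrative and not part of the Ultimate RVC codebase.

from functools import partial

import gradio as gr


def route_track(
    num_components: int,
    indices: list[int] | None,
    track: str | None,
) -> tuple[gr.Audio, ...]:
    """Route a finished track to the Audio components at the given indices."""
    # Build one update per Audio component; only the selected indices get a value.
    updates = [gr.Audio() for _ in range(num_components)]
    for i in indices or []:
        updates[i] = gr.Audio(value=track)
    return tuple(updates)


with gr.Blocks() as demo:
    source = gr.Audio(label="Finished track", type="filepath")
    destinations = [
        gr.Audio(label=f"Step {i + 1} input", type="filepath") for i in range(3)
    ]
    picker = gr.Dropdown(
        [f"Step {i + 1} input" for i in range(3)],
        label="Transfer destination",
        type="index",
        multiselect=True,
    )
    gr.Button("Transfer").click(
        partial(route_track, len(destinations)),
        inputs=[picker, source],
        outputs=destinations,
        show_progress="hidden",
    )

if __name__ == "__main__":
    demo.launch()

Returning component instances that carry only the changed keyword arguments leaves the non-selected Audio components untouched, which is why the _update_audio helper in the diff builds an empty kwargs dict for outputs that are not in the selected indices.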