lewiswu1209 committed
Commit f4dac30
0 Parent(s)

initial commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +31 -0
  2. .gitignore +18 -0
  3. CODE_OF_CONDUCT.md +130 -0
  4. LICENSE.txt +24 -0
  5. README-CN.md +230 -0
  6. README.md +13 -0
  7. app.py +80 -0
  8. demo_toolbox.py +49 -0
  9. encoder/__init__.py +0 -0
  10. encoder/audio.py +117 -0
  11. encoder/config.py +45 -0
  12. encoder/data_objects/__init__.py +2 -0
  13. encoder/data_objects/random_cycler.py +37 -0
  14. encoder/data_objects/speaker.py +40 -0
  15. encoder/data_objects/speaker_batch.py +12 -0
  16. encoder/data_objects/speaker_verification_dataset.py +56 -0
  17. encoder/data_objects/utterance.py +26 -0
  18. encoder/inference.py +195 -0
  19. encoder/model.py +135 -0
  20. encoder/params_data.py +29 -0
  21. encoder/params_model.py +11 -0
  22. encoder/preprocess.py +184 -0
  23. encoder/saved_models/pretrained.pt +3 -0
  24. encoder/train.py +123 -0
  25. encoder/visualizations.py +178 -0
  26. encoder_preprocess.py +61 -0
  27. encoder_train.py +47 -0
  28. gen_voice.py +128 -0
  29. mkgui/__init__.py +0 -0
  30. mkgui/app.py +145 -0
  31. mkgui/app_vc.py +166 -0
  32. mkgui/base/__init__.py +2 -0
  33. mkgui/base/api/__init__.py +1 -0
  34. mkgui/base/api/fastapi_utils.py +102 -0
  35. mkgui/base/components/__init__.py +0 -0
  36. mkgui/base/components/outputs.py +43 -0
  37. mkgui/base/components/types.py +46 -0
  38. mkgui/base/core.py +203 -0
  39. mkgui/base/ui/__init__.py +1 -0
  40. mkgui/base/ui/schema_utils.py +129 -0
  41. mkgui/base/ui/streamlit_ui.py +888 -0
  42. mkgui/base/ui/streamlit_utils.py +13 -0
  43. mkgui/preprocess.py +96 -0
  44. mkgui/static/mb.png +0 -0
  45. mkgui/train.py +106 -0
  46. mkgui/train_vc.py +155 -0
  47. packages.txt +5 -0
  48. ppg2mel/__init__.py +209 -0
  49. ppg2mel/preprocess.py +113 -0
  50. ppg2mel/rnn_decoder_mol.py +374 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
+ *.pyc
+ *.aux
+ *.log
+ *.out
+ *.synctex.gz
+ *.suo
+ *__pycache__
+ *.idea
+ *.ipynb_checkpoints
+ *.pickle
+ *.npy
+ *.blg
+ *.bbl
+ *.bcf
+ *.toc
+ *.sh
+ wavs
+ log
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,130 @@
+ # Contributor Covenant Code of Conduct
+ ## First of all
+ Don't be evil, never
+
+ ## Our Pledge
+
+ We as members, contributors, and leaders pledge to make participation in our
+ community a harassment-free experience for everyone, regardless of age, body
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
+ identity and expression, level of experience, education, socio-economic status,
+ nationality, personal appearance, race, religion, or sexual identity
+ and orientation.
+
+ We pledge to act and interact in ways that contribute to an open, welcoming,
+ diverse, inclusive, and healthy community.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to a positive environment for our
+ community include:
+
+ * Demonstrating empathy and kindness toward other people
+ * Being respectful of differing opinions, viewpoints, and experiences
+ * Giving and gracefully accepting constructive feedback
+ * Accepting responsibility and apologizing to those affected by our mistakes,
+   and learning from the experience
+ * Focusing on what is best not just for us as individuals, but for the
+   overall community
+
+ Examples of unacceptable behavior include:
+
+ * The use of sexualized language or imagery, and sexual attention or
+   advances of any kind
+ * Trolling, insulting or derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or email
+   address, without their explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Enforcement Responsibilities
+
+ Community leaders are responsible for clarifying and enforcing our standards of
+ acceptable behavior and will take appropriate and fair corrective action in
+ response to any behavior that they deem inappropriate, threatening, offensive,
+ or harmful.
+
+ Community leaders have the right and responsibility to remove, edit, or reject
+ comments, commits, code, wiki edits, issues, and other contributions that are
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
+ decisions when appropriate.
+
+ ## Scope
+
+ This Code of Conduct applies within all community spaces, and also applies when
+ an individual is officially representing the community in public spaces.
+ Examples of representing our community include using an official e-mail address,
+ posting via an official social media account, or acting as an appointed
+ representative at an online or offline event.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported to the community leaders responsible for enforcement at
+
+ All complaints will be reviewed and investigated promptly and fairly.
+
+ All community leaders are obligated to respect the privacy and security of the
+ reporter of any incident.
+
+ ## Enforcement Guidelines
+
+ Community leaders will follow these Community Impact Guidelines in determining
+ the consequences for any action they deem in violation of this Code of Conduct:
+
+ ### 1. Correction
+
+ **Community Impact**: Use of inappropriate language or other behavior deemed
+ unprofessional or unwelcome in the community.
+
+ **Consequence**: A private, written warning from community leaders, providing
+ clarity around the nature of the violation and an explanation of why the
+ behavior was inappropriate. A public apology may be requested.
+
+ ### 2. Warning
+
+ **Community Impact**: A violation through a single incident or series
+ of actions.
+
+ **Consequence**: A warning with consequences for continued behavior. No
+ interaction with the people involved, including unsolicited interaction with
+ those enforcing the Code of Conduct, for a specified period of time. This
+ includes avoiding interactions in community spaces as well as external channels
+ like social media. Violating these terms may lead to a temporary or
+ permanent ban.
+
+ ### 3. Temporary Ban
+
+ **Community Impact**: A serious violation of community standards, including
+ sustained inappropriate behavior.
+
+ **Consequence**: A temporary ban from any sort of interaction or public
+ communication with the community for a specified period of time. No public or
+ private interaction with the people involved, including unsolicited interaction
+ with those enforcing the Code of Conduct, is allowed during this period.
+ Violating these terms may lead to a permanent ban.
+
+ ### 4. Permanent Ban
+
+ **Community Impact**: Demonstrating a pattern of violation of community
+ standards, including sustained inappropriate behavior, harassment of an
+ individual, or aggression toward or disparagement of classes of individuals.
+
+ **Consequence**: A permanent ban from any sort of public interaction within
+ the community.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+ version 2.0, available at
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
+ enforcement ladder](https://github.com/mozilla/diversity).
+
+ [homepage]: https://www.contributor-covenant.org
+
+ For answers to common questions about this code of conduct, see the FAQ at
+ https://www.contributor-covenant.org/faq. Translations are available at
+ https://www.contributor-covenant.org/translations.
LICENSE.txt ADDED
@@ -0,0 +1,24 @@
+ MIT License
+
+ Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
+ Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
+ Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
+ Original work Copyright (c) 2015 braindead (https://github.com/braindead)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README-CN.md ADDED
@@ -0,0 +1,230 @@
+ ## Real-Time Voice Cloning - Chinese/Mandarin
+ ![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
+
+ [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
+
+ ### [English](README.md) | Chinese
+
+ ### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/) | [Wiki tutorial](https://github.com/babysor/MockingBird/wiki/Quick-Start-(Newbie)) | [Training tutorial](https://vaj2fgg8yn.feishu.cn/docs/doccn7kAbr3SJz0KM0SIDJ0Xnhd)
+
+ ## Features
+ 🌍 **Chinese** Supports Mandarin and is tested on multiple Chinese datasets: aidatatang_200zh, magicdata, aishell3, biaobei, MozillaCommonVoice, data_aishell, etc.
+
+ 🤩 **PyTorch** Works with PyTorch, tested on version 1.9.0 (latest as of August 2021) with Tesla T4 and GTX 2060 GPUs
+
+ 🌍 **Windows + Linux** Runs on both Windows and Linux (community members have also run it successfully on Apple M1)
+
+ 🤩 **Easy & Awesome** Good results by only downloading or newly training a synthesizer, reusing the pretrained encoder/vocoder, or using a real-time HiFi-GAN as the vocoder
+
+ 🌍 **Webserver Ready** Can serve your training results for remote calls
+
+ ### Work in progress
+ * Major GUI/client upgrade and merge
+ [X] Initialize the `./mkgui` framework (based on streamlit + fastapi) and the [technical design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
+ [X] Add a demo page for Voice Cloning and Conversion
+ [X] Add preprocessing and training pages for Voice Conversion
+ [ ] Add preprocessing and training pages for the other modules
+ * Upgrade the model backend to ESPnet2
+
+
+ ## Getting Started
+ ### 1. Install requirements
+ > Follow the original repository to check whether your environment is ready.
+ **Python 3.7 or higher** is required to run the toolbox (demo_toolbox.py).
+
+ * Install [PyTorch](https://pytorch.org/get-started/locally/).
+ > If pip installation fails with `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, your Python version may be too old; 3.9 installs successfully
+ * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
+ * Run `pip install -r requirements.txt` to install the remaining required packages.
+ * Install webrtcvad with `pip install webrtcvad-wheels`.
+
+ ### 2. Prepare pretrained models
+ Consider training your own models or downloading models trained by the community:
+ > A [Zhihu column](https://www.zhihu.com/column/c_1425605280340504576) was created recently and will be updated occasionally with training tips and lessons learned; questions are welcome
+ #### 2.1 Train the encoder with your own datasets (optional)
+
+ * Preprocess the audio and mel spectrograms:
+ `python encoder_preprocess.py <datasets_root>`
+ Use `-d {dataset}` to specify the dataset; librispeech_other, voxceleb1 and aidatatang_200zh are supported; separate multiple datasets with commas.
+ * Train the encoder: `python encoder_train.py my_run <datasets_root>/SV2TTS/encoder`
+ > Encoder training uses visdom. You can add `-no_visdom` to disable it, but visualization is better. Run "visdom" in a separate terminal/process to start the visdom server.
+
+ #### 2.2 Train the synthesizer with your own datasets (choose either this or 2.3)
+ * Download a dataset and extract it: make sure you can access all the audio files (e.g. .wav) in the *train* folder
+ * Preprocess the audio and mel spectrograms:
+ `python pre.py <datasets_root> -d {dataset} -n {number}`
+ Supported arguments:
+ * `-d {dataset}` specifies the dataset; aidatatang_200zh, magicdata, aishell3 and data_aishell are supported; defaults to aidatatang_200zh if omitted
+ * `-n {number}` sets the number of parallel workers; 10 was verified to work on an 11770k CPU with 32 GB RAM
+ > If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`
+
+ * Train the synthesizer:
+ `python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
+
+ * When the attention line appears in the training folder *synthesizer/saved_models/* and the loss meets your needs, go to the `Launch` step.
+
+ #### 2.3 Use a synthesizer pretrained by the community (choose either this or 2.2)
+ > If you have no hardware or do not want to spend time tuning, you can use models contributed by the community (continued sharing is welcome):
+
+ | Author | Download link | Preview | Info |
+ | --- | ----------- | ----- | ----- |
+ | Author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu drive link](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) code: 4j5d | | 75k steps, trained on a mix of 3 open-source datasets
+ | Author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu drive link](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) code: om7f | | 25k steps, trained on a mix of 3 open-source datasets; switch to tag v0.0.1 to use
+ |@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [Baidu drive link](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) code: 1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps, Taiwanese accent; switch to tag v0.0.1 to use
+ |@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps; note: apply the fix from this [issue](https://github.com/babysor/MockingBird/issues/37) and switch to tag v0.0.1 to use
+
+ #### 2.4 Train a vocoder (optional)
+ The vocoder has little impact on the results; 3 are already bundled. If you want to train your own, you can refer to the commands below.
+ * Preprocess the data:
+ `python vocoder_preprocess.py <datasets_root> -m <synthesizer_model_path>`
+ > Replace `<datasets_root>` with your dataset directory and `<synthesizer_model_path>` with the directory of your best synthesizer model, e.g. *synthesizer\saved_models\xxx*
+
+
+ * Train the wavernn vocoder:
+ `python vocoder_train.py <trainid> <datasets_root>`
+ > Replace `<trainid>` with any identifier you like; training again with the same identifier resumes from the existing model
+
+ * Train the hifigan vocoder:
+ `python vocoder_train.py <trainid> <datasets_root> hifigan`
+ > Replace `<trainid>` with any identifier you like; training again with the same identifier resumes from the existing model
+ * Train the fregan vocoder:
+ `python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
+ > Replace `<trainid>` with any identifier you like; training again with the same identifier resumes from the existing model
+ * To switch GAN vocoder training to multi-GPU mode: change the "num_gpus" parameter in the .json file under the GAN folder
+ ### 3. Launch the program or toolbox
+ You can try the following commands:
+
+ ### 3.1 Launch the web program (v2):
+ `python web.py`
+ Once it is running, open the address in a browser; the default is `http://localhost:8080`
+ > * Only newly made manual recordings (16 kHz) are supported; recordings larger than 4 MB are not supported; the best length is 5~15 seconds
+
+ ### 3.2 Launch the toolbox:
+ `python demo_toolbox.py -d <datasets_root>`
+ > Specify a usable dataset path; if a supported dataset is present it is loaded automatically for debugging, and the path also serves as the storage directory for manually recorded audio.
+
+ <img width="1042" alt="d48ea37adf3660e657cfb047c10edbc" src="https://user-images.githubusercontent.com/7423248/134275227-c1ddf154-f118-4b77-8949-8c4c7daf25f0.png">
+
+ ### 4. Extra: Voice Conversion (PPG based)
+ Ever imagined Conan holding a voice changer and speaking in Kogoro Mouri's voice? Based on PPG-VC, this project now introduces two extra modules (PPG extractor + PPG2Mel) to support voice conversion. (The documentation is incomplete, especially the training part, and is being filled in.)
+ #### 4.0 Prepare the environment
+ * Make sure the environment above is installed, then run `pip install espnet` to install the remaining required packages.
+ * Download the following models. Link: https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
+ Extraction code: gh41
+ * the vocoder dedicated to the 24 kHz sampling rate (hifigan), into *vocoder\saved_models\xxx*
+ * the pretrained PPG feature encoder (ppg_extractor), into *ppg_extractor\saved_models\xxx*
+ * the pretrained PPG2Mel, into *ppg2mel\saved_models\xxx*
+
+ #### 4.1 Train the PPG2Mel model with your own datasets (optional)
+
+ * Download the aidatatang_200zh dataset and extract it: make sure you can access all the audio files (e.g. .wav) in the *train* folder
+ * Preprocess the audio and mel spectrograms:
+ `python pre4ppg.py <datasets_root> -d {dataset} -n {number}`
+ Supported arguments:
+ * `-d {dataset}` specifies the dataset; only aidatatang_200zh is supported; defaults to aidatatang_200zh if omitted
+ * `-n {number}` sets the number of parallel workers; with 8 on an 11770k CPU this takes 12 to 18 hours! To be optimized
+ > If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`
+
+ * Train the synthesizer; note that `ppg2mel.yaml` must be downloaded in the previous step, and the paths inside it must be edited to point to the pretrained folders:
+ `python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
+ * To resume a previous training run, use the `--load .\ppg2mel\saved_models\<old_pt_file>` argument to specify a pretrained model file.
+
+ #### 4.2 Launch the toolbox in VC mode
+ You can try the following command:
+ `python demo_toolbox.py -vc -d <datasets_root>`
+ > Specify a usable dataset path; if a supported dataset is present it is loaded automatically for debugging, and the path also serves as the storage directory for manually recorded audio.
+ <img width="971" alt="微信图片_20220305005351" src="https://user-images.githubusercontent.com/7423248/156805733-2b093dbc-d989-4e68-8609-db11f365886a.png">
+
+ ## References and papers
+ > This repository was originally forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which only supports English; thanks to its author.
+
+ | URL | Designation | Title | Implementation source |
+ | --- | ----------- | ----- | --------------------- |
+ | [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | This repo |
+ | [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | This repo |
+ | [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | This repo |
+ |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | This repo |
+ |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+ |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
+ |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
+
+ ## Frequently Asked Questions (FAQ)
+ #### 1. Where can I download the datasets?
+ | Dataset | OpenSLR link | Other sources (Google Drive, Baidu netdisk, etc.) |
+ | --- | ----------- | ---------------|
+ | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
+ | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
+ | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
+ | data_aishell | [OpenSLR](https://www.openslr.org/33/) | |
+ > After extracting aidatatang_200zh, you still need to select all the archives under `aidatatang_200zh\corpus\train` and extract them
+
+ #### 2. What does `<datasets_root>` mean?
+ If the dataset path is `D:\data\aidatatang_200zh`, then `<datasets_root>` is `D:\data`
+
+ #### 3. Running out of GPU memory while training
+ When training the synthesizer: reduce the batch_size parameter in `synthesizer/hparams.py`
+ ```
+ # Before adjustment
+ tts_schedule = [(2, 1e-3, 20_000, 12),   # Progressive training schedule
+                 (2, 5e-4, 40_000, 12),   # (r, lr, step, batch_size)
+                 (2, 2e-4, 80_000, 12),   #
+                 (2, 1e-4, 160_000, 12),  # r = reduction factor (# of mel frames
+                 (2, 3e-5, 320_000, 12),  # synthesized for each decoder iteration)
+                 (2, 1e-5, 640_000, 12)], # lr = learning rate
+ # After adjustment
+ tts_schedule = [(2, 1e-3, 20_000, 8),    # Progressive training schedule
+                 (2, 5e-4, 40_000, 8),    # (r, lr, step, batch_size)
+                 (2, 2e-4, 80_000, 8),    #
+                 (2, 1e-4, 160_000, 8),   # r = reduction factor (# of mel frames
+                 (2, 3e-5, 320_000, 8),   # synthesized for each decoder iteration)
+                 (2, 1e-5, 640_000, 8)],  # lr = learning rate
+ ```
+
+ Vocoder - when preprocessing the dataset: reduce the batch_size parameter in `synthesizer/hparams.py`
+ ```
+ # Before adjustment
+ ### Data Preprocessing
+ max_mel_frames = 900,
+ rescale = True,
+ rescaling_max = 0.9,
+ synthesis_batch_size = 16,  # For vocoder preprocessing and inference.
+ # After adjustment
+ ### Data Preprocessing
+ max_mel_frames = 900,
+ rescale = True,
+ rescaling_max = 0.9,
+ synthesis_batch_size = 8,   # For vocoder preprocessing and inference.
+ ```
+
+ Vocoder - when training the vocoder: reduce the batch_size parameter in `vocoder/wavernn/hparams.py`
+ ```
+ # Before adjustment
+ # Training
+ voc_batch_size = 100
+ voc_lr = 1e-4
+ voc_gen_at_checkpoint = 5
+ voc_pad = 2
+
+ # After adjustment
+ # Training
+ voc_batch_size = 6
+ voc_lr = 1e-4
+ voc_gen_at_checkpoint = 5
+ voc_pad = 2
+ ```
+
+ #### 4. Encountering `RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
+ Please refer to issue [#37](https://github.com/babysor/MockingBird/issues/37)
+
+ #### 5. How can I improve CPU and GPU utilization?
+ Adjust the batch_size parameter as appropriate
+
+ #### 6. Getting `The paging file is too small for this operation to complete`
+ Refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100 GB (102400); for example, if the files are on drive D, change drive D's virtual memory
+
+ #### 7. When is training considered finished?
+ First, the attention alignment must appear; second, the loss must be low enough, which depends on your hardware and dataset. For reference, my attention appeared after 18k steps and the loss dropped below 0.4 after 50k steps
+ ![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
+
+ ![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)
+
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: MockingBird
+ emoji: 🔥
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.1.3
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,80 @@
+
+ import gradio as gr
+
+ import re
+ import random
+ import string
+ import librosa
+ import numpy as np
+
+ from pathlib import Path
+ from scipy.io.wavfile import write
+
+ from encoder import inference as encoder
+ from vocoder.hifigan import inference as gan_vocoder
+ from synthesizer.inference import Synthesizer
+
+ class Mandarin:
+     def __init__(self):
+         self.encoder_path = "encoder/saved_models/pretrained.pt"
+         self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
+         self.config_fpath = "vocoder/hifigan/config_16k_.json"
+         self.accent = "synthesizer/saved_models/普通话.pt"
+
+         synthesizers_cache = {}
+         if synthesizers_cache.get(self.accent) is None:
+             self.current_synt = Synthesizer(Path(self.accent))
+             synthesizers_cache[self.accent] = self.current_synt
+         else:
+             self.current_synt = synthesizers_cache[self.accent]
+
+         encoder.load_model(Path(self.encoder_path))
+         gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)
+
+     def setVoice(self, timbre):
+         self.timbre = timbre
+         wav, sample_rate = librosa.load(self.timbre)
+
+         encoder_wav = encoder.preprocess_wav(wav, sample_rate)
+         self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
+
+     def say(self, text):
+         texts = filter(None, text.split("\n"))
+         punctuation = "!,。、?!,.?::"  # punctuation used to split/clean the text
+         processed_texts = []
+         for text in texts:
+             for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
+                 if processed_text:
+                     processed_texts.append(processed_text.strip())
+         texts = processed_texts
+         embeds = [self.embed] * len(texts)
+
+         specs = self.current_synt.synthesize_spectrograms(texts, embeds)
+         spec = np.concatenate(specs, axis=1)
+         wav, sample_rate = gan_vocoder.infer_waveform(spec)
+
+         return wav, sample_rate
+
+ def greet(audio, text, voice=None):
+
+     if voice is None:
+         voice = Mandarin()
+         voice.setVoice(audio.name)
+         voice.say("加载成功")
+     wav, sample_rate = voice.say(text)
+
+     output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
+
+     write(output_file, sample_rate, wav.astype(np.float32))
+
+     return output_file, voice
+
+ def main():
+     gr.Interface(
+         fn=greet,
+         inputs=[gr.inputs.Audio(type="file"), "text", "state"],
+         outputs=[gr.outputs.Audio(type="file"), "state"]
+     ).launch()
+
+ if __name__=="__main__":
+     main()
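A minimal sketch of driving the `Mandarin` class above without the Gradio interface, assuming the pretrained models referenced in `__init__` are in place; `reference.wav`, `cloned.wav` and the example sentence are placeholders:

```
import numpy as np
from scipy.io.wavfile import write

from app import Mandarin

voice = Mandarin()                # loads the encoder, synthesizer and HiFi-GAN vocoder
voice.setVoice("reference.wav")   # compute the speaker embedding from a reference clip
wav, sample_rate = voice.say("欢迎使用语音克隆工具")
write("cloned.wav", sample_rate, wav.astype(np.float32))
```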
demo_toolbox.py ADDED
@@ -0,0 +1,49 @@
+ from pathlib import Path
+ from toolbox import Toolbox
+ from utils.argutils import print_args
+ from utils.modelutils import check_model_paths
+ import argparse
+ import os
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(
+         description="Runs the toolbox",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument("-d", "--datasets_root", type=Path, help= \
+         "Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
+         "supported datasets.", default=None)
+     parser.add_argument("-vc", "--vc_mode", action="store_true",
+                         help="Voice Conversion Mode (PPG based)")
+     parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
+                         help="Directory containing saved encoder models")
+     parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
+                         help="Directory containing saved synthesizer models")
+     parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
+                         help="Directory containing saved vocoder models")
+     parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
+                         help="Directory containing saved extractor models")
+     parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
+                         help="Directory containing saved converter models")
+     parser.add_argument("--cpu", action="store_true", help=\
+         "If True, processing is done on CPU, even when a GPU is available.")
+     parser.add_argument("--seed", type=int, default=None, help=\
+         "Optional random number seed value to make toolbox deterministic.")
+     parser.add_argument("--no_mp3_support", action="store_true", help=\
+         "If True, no mp3 files are allowed.")
+     args = parser.parse_args()
+     print_args(args, parser)
+
+     if args.cpu:
+         # Hide GPUs from Pytorch to force CPU processing
+         os.environ["CUDA_VISIBLE_DEVICES"] = ""
+     del args.cpu
+
+     ## Remind the user to download pretrained models if needed
+     check_model_paths(encoder_path=args.enc_models_dir, synthesizer_path=args.syn_models_dir,
+                       vocoder_path=args.voc_models_dir)
+
+     # Launch the toolbox
+     Toolbox(**vars(args))
encoder/__init__.py ADDED
File without changes
encoder/audio.py ADDED
@@ -0,0 +1,117 @@
+ from scipy.ndimage.morphology import binary_dilation
+ from encoder.params_data import *
+ from pathlib import Path
+ from typing import Optional, Union
+ from warnings import warn
+ import numpy as np
+ import librosa
+ import struct
+
+ try:
+     import webrtcvad
+ except Exception:
+     warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
+     webrtcvad = None
+
+ int16_max = (2 ** 15) - 1
+
+
+ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+                    source_sr: Optional[int] = None,
+                    normalize: Optional[bool] = True,
+                    trim_silence: Optional[bool] = True):
+     """
+     Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+     either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+
+     :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+     just .wav), or the waveform as a numpy array of floats.
+     :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+     preprocessing. After preprocessing, the waveform's sampling rate will match the data
+     hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+     this argument will be ignored.
+     """
+     # Load the wav from disk if needed
+     if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
+     else:
+         wav = fpath_or_wav
+
+     # Resample the wav if needed
+     if source_sr is not None and source_sr != sampling_rate:
+         wav = librosa.resample(wav, source_sr, sampling_rate)
+
+     # Apply the preprocessing: normalize volume and shorten long silences
+     if normalize:
+         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+     if webrtcvad and trim_silence:
+         wav = trim_long_silences(wav)
+
+     return wav
+
+
+ def wav_to_mel_spectrogram(wav):
+     """
+     Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+     Note: this is not a log-mel spectrogram.
+     """
+     frames = librosa.feature.melspectrogram(
+         y=wav,
+         sr=sampling_rate,
+         n_fft=int(sampling_rate * mel_window_length / 1000),
+         hop_length=int(sampling_rate * mel_window_step / 1000),
+         n_mels=mel_n_channels
+     )
+     return frames.astype(np.float32).T
+
+
+ def trim_long_silences(wav):
+     """
+     Ensures that segments without voice in the waveform remain no longer than a
+     threshold determined by the VAD parameters in params.py.
+
+     :param wav: the raw waveform as a numpy array of floats
+     :return: the same waveform with silences trimmed away (length <= original wav length)
+     """
+     # Compute the voice detection window size
+     samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+     # Trim the end of the audio to have a multiple of the window size
+     wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+     # Convert the float waveform to 16-bit mono PCM
+     pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+     # Perform voice activation detection
+     voice_flags = []
+     vad = webrtcvad.Vad(mode=3)
+     for window_start in range(0, len(wav), samples_per_window):
+         window_end = window_start + samples_per_window
+         voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                          sample_rate=sampling_rate))
+     voice_flags = np.array(voice_flags)
+
+     # Smooth the voice detection with a moving average
+     def moving_average(array, width):
+         array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+         ret = np.cumsum(array_padded, dtype=float)
+         ret[width:] = ret[width:] - ret[:-width]
+         return ret[width - 1:] / width
+
+     audio_mask = moving_average(voice_flags, vad_moving_average_width)
+     audio_mask = np.round(audio_mask).astype(bool)
+
+     # Dilate the voiced regions
+     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+     audio_mask = np.repeat(audio_mask, samples_per_window)
+
+     return wav[audio_mask == True]
+
+
+ def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+     if increase_only and decrease_only:
+         raise ValueError("Both increase only and decrease only are set")
+     dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+     if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+         return wav
+     return wav * (10 ** (dBFS_change / 20))
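A short sketch of how the helpers above are typically chained, assuming an input file `sample.wav` exists:

```
from encoder.audio import preprocess_wav, wav_to_mel_spectrogram

wav = preprocess_wav("sample.wav")    # load, resample to 16 kHz, normalize volume, trim long silences
frames = wav_to_mel_spectrogram(wav)  # shape (n_frames, mel_n_channels), one frame per 10 ms hop
print(wav.shape, frames.shape)
```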
encoder/config.py ADDED
@@ -0,0 +1,45 @@
+ librispeech_datasets = {
+     "train": {
+         "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+         "other": ["LibriSpeech/train-other-500"]
+     },
+     "test": {
+         "clean": ["LibriSpeech/test-clean"],
+         "other": ["LibriSpeech/test-other"]
+     },
+     "dev": {
+         "clean": ["LibriSpeech/dev-clean"],
+         "other": ["LibriSpeech/dev-other"]
+     },
+ }
+ libritts_datasets = {
+     "train": {
+         "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+         "other": ["LibriTTS/train-other-500"]
+     },
+     "test": {
+         "clean": ["LibriTTS/test-clean"],
+         "other": ["LibriTTS/test-other"]
+     },
+     "dev": {
+         "clean": ["LibriTTS/dev-clean"],
+         "other": ["LibriTTS/dev-other"]
+     },
+ }
+ voxceleb_datasets = {
+     "voxceleb1": {
+         "train": ["VoxCeleb1/wav"],
+         "test": ["VoxCeleb1/test_wav"]
+     },
+     "voxceleb2": {
+         "train": ["VoxCeleb2/dev/aac"],
+         "test": ["VoxCeleb2/test_wav"]
+     }
+ }
+
+ other_datasets = [
+     "LJSpeech-1.1",
+     "VCTK-Corpus/wav48",
+ ]
+
+ anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
encoder/data_objects/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+ from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
encoder/data_objects/random_cycler.py ADDED
@@ -0,0 +1,37 @@
+ import random
+
+ class RandomCycler:
+     """
+     Creates an internal copy of a sequence and allows access to its items in a constrained random
+     order. For a source sequence of n items and one or several consecutive queries of a total
+     of m items, the following guarantees hold (one implies the other):
+         - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
+         - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
+     """
+
+     def __init__(self, source):
+         if len(source) == 0:
+             raise Exception("Can't create RandomCycler from an empty collection")
+         self.all_items = list(source)
+         self.next_items = []
+
+     def sample(self, count: int):
+         shuffle = lambda l: random.sample(l, len(l))
+
+         out = []
+         while count > 0:
+             if count >= len(self.all_items):
+                 out.extend(shuffle(list(self.all_items)))
+                 count -= len(self.all_items)
+                 continue
+             n = min(count, len(self.next_items))
+             out.extend(self.next_items[:n])
+             count -= n
+             self.next_items = self.next_items[n:]
+             if len(self.next_items) == 0:
+                 self.next_items = shuffle(list(self.all_items))
+         return out
+
+     def __next__(self):
+         return self.sample(1)[0]
+
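A quick illustration of the sampling guarantee in the docstring: drawing m = 10 items from an n = 5 item source returns every item exactly 10 // 5 = 2 times, in a shuffled order:

```
from collections import Counter

from encoder.data_objects.random_cycler import RandomCycler

cycler = RandomCycler(["a", "b", "c", "d", "e"])
draws = cycler.sample(10)   # two full shuffled passes over the source
print(Counter(draws))       # every item appears exactly twice
```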
encoder/data_objects/speaker.py ADDED
@@ -0,0 +1,40 @@
+ from encoder.data_objects.random_cycler import RandomCycler
+ from encoder.data_objects.utterance import Utterance
+ from pathlib import Path
+
+ # Contains the set of utterances of a single speaker
+ class Speaker:
+     def __init__(self, root: Path):
+         self.root = root
+         self.name = root.name
+         self.utterances = None
+         self.utterance_cycler = None
+
+     def _load_utterances(self):
+         with self.root.joinpath("_sources.txt").open("r") as sources_file:
+             sources = [l.split(",") for l in sources_file]
+         sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+         self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+         self.utterance_cycler = RandomCycler(self.utterances)
+
+     def random_partial(self, count, n_frames):
+         """
+         Samples a batch of <count> unique partial utterances from the disk in a way that all
+         utterances come up at least once every two cycles and in a random order every time.
+
+         :param count: The number of partial utterances to sample from the set of utterances from
+         that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
+         the number of utterances available.
+         :param n_frames: The number of frames in the partial utterance.
+         :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+         frames are the frames of the partial utterances and range is the range of the partial
+         utterance with regard to the complete utterance.
+         """
+         if self.utterances is None:
+             self._load_utterances()
+
+         utterances = self.utterance_cycler.sample(count)
+
+         a = [(u,) + u.random_partial(n_frames) for u in utterances]
+
+         return a
encoder/data_objects/speaker_batch.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+ from typing import List
+ from encoder.data_objects.speaker import Speaker
+
+ class SpeakerBatch:
+     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+         self.speakers = speakers
+         self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+
+         # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+         # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+         self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
encoder/data_objects/speaker_verification_dataset.py ADDED
@@ -0,0 +1,56 @@
+ from encoder.data_objects.random_cycler import RandomCycler
+ from encoder.data_objects.speaker_batch import SpeakerBatch
+ from encoder.data_objects.speaker import Speaker
+ from encoder.params_data import partials_n_frames
+ from torch.utils.data import Dataset, DataLoader
+ from pathlib import Path
+
+ # TODO: improve with a pool of speakers for data efficiency
+
+ class SpeakerVerificationDataset(Dataset):
+     def __init__(self, datasets_root: Path):
+         self.root = datasets_root
+         speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+         if len(speaker_dirs) == 0:
+             raise Exception("No speakers found. Make sure you are pointing to the directory "
+                             "containing all preprocessed speaker directories.")
+         self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+         self.speaker_cycler = RandomCycler(self.speakers)
+
+     def __len__(self):
+         return int(1e10)
+
+     def __getitem__(self, index):
+         return next(self.speaker_cycler)
+
+     def get_logs(self):
+         log_string = ""
+         for log_fpath in self.root.glob("*.txt"):
+             with log_fpath.open("r") as log_file:
+                 log_string += "".join(log_file.readlines())
+         return log_string
+
+
+ class SpeakerVerificationDataLoader(DataLoader):
+     def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+                  batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+                  worker_init_fn=None):
+         self.utterances_per_speaker = utterances_per_speaker
+
+         super().__init__(
+             dataset=dataset,
+             batch_size=speakers_per_batch,
+             shuffle=False,
+             sampler=sampler,
+             batch_sampler=batch_sampler,
+             num_workers=num_workers,
+             collate_fn=self.collate,
+             pin_memory=pin_memory,
+             drop_last=False,
+             timeout=timeout,
+             worker_init_fn=worker_init_fn
+         )
+
+     def collate(self, speakers):
+         return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
+
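A minimal sketch of how the dataset and loader above are wired together for encoder training; `<datasets_root>/SV2TTS/encoder` stands for the directory produced by `encoder_preprocess.py`, and the batch sizes come from `encoder/params_model.py`:

```
from pathlib import Path

from encoder.data_objects import SpeakerVerificationDataset, SpeakerVerificationDataLoader
from encoder.params_model import speakers_per_batch, utterances_per_speaker

dataset = SpeakerVerificationDataset(Path("<datasets_root>/SV2TTS/encoder"))
loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch, utterances_per_speaker)

batch = next(iter(loader))
# (speakers_per_batch * utterances_per_speaker, partials_n_frames, mel_n_channels)
print(batch.data.shape)   # (640, 160, 40) with the default parameters
```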
encoder/data_objects/utterance.py ADDED
@@ -0,0 +1,26 @@
+ import numpy as np
+
+
+ class Utterance:
+     def __init__(self, frames_fpath, wave_fpath):
+         self.frames_fpath = frames_fpath
+         self.wave_fpath = wave_fpath
+
+     def get_frames(self):
+         return np.load(self.frames_fpath)
+
+     def random_partial(self, n_frames):
+         """
+         Crops the frames into a partial utterance of n_frames
+
+         :param n_frames: The number of frames of the partial utterance
+         :return: the partial utterance frames and a tuple indicating the start and end of the
+         partial utterance in the complete utterance.
+         """
+         frames = self.get_frames()
+         if frames.shape[0] == n_frames:
+             start = 0
+         else:
+             start = np.random.randint(0, frames.shape[0] - n_frames)
+         end = start + n_frames
+         return frames[start:end], (start, end)
encoder/inference.py ADDED
@@ -0,0 +1,195 @@
+ from encoder.params_data import *
+ from encoder.model import SpeakerEncoder
+ from encoder.audio import preprocess_wav  # We want to expose this function from here
+ from matplotlib import cm
+ from encoder import audio
+ from pathlib import Path
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import torch
+
+ _model = None  # type: SpeakerEncoder
+ _device = None  # type: torch.device
+
+
+ def load_model(weights_fpath: Path, device=None):
+     """
+     Loads the model in memory. If this function is not explicitly called, it will be run on the
+     first call to embed_frames() with the default weights file.
+
+     :param weights_fpath: the path to saved model weights.
+     :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+     model will be loaded and will run on this device. Outputs will however always be on the cpu.
+     If None, will default to your GPU if it's available, otherwise your CPU.
+     """
+     # TODO: I think the slow loading of the encoder might have something to do with the device it
+     # was saved on. Worth investigating.
+     global _model, _device
+     if device is None:
+         _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     elif isinstance(device, str):
+         _device = torch.device(device)
+     _model = SpeakerEncoder(_device, torch.device("cpu"))
+     checkpoint = torch.load(weights_fpath, _device)
+     _model.load_state_dict(checkpoint["model_state"])
+     _model.eval()
+     print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
+     return _model
+
+ def set_model(model, device=None):
+     global _model, _device
+     _model = model
+     if device is None:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     _device = device
+     _model.to(device)
+
+ def is_loaded():
+     return _model is not None
+
+
+ def embed_frames_batch(frames_batch):
+     """
+     Computes embeddings for a batch of mel spectrograms.
+
+     :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
+     (batch_size, n_frames, n_channels)
+     :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
+     """
+     if _model is None:
+         raise Exception("Model was not loaded. Call load_model() before inference.")
+
+     frames = torch.from_numpy(frames_batch).to(_device)
+     embed = _model.forward(frames).detach().cpu().numpy()
+     return embed
+
+
+ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+                            min_pad_coverage=0.75, overlap=0.5, rate=None):
+     """
+     Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+     partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
+     spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+     its spectrogram. This function assumes that the mel spectrogram parameters used are those
+     defined in params_data.py.
+
+     The returned ranges may be indexing further than the length of the waveform. It is
+     recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+
+     :param n_samples: the number of samples in the waveform
+     :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+     utterance
+     :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+     enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+     then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+     it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+     utterance, this parameter is ignored so that the function always returns at least 1 slice.
+     :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+     utterances are entirely disjoint.
+     :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+     respectively the waveform and the mel spectrogram with these slices to obtain the partial
+     utterances.
+     """
+     assert 0 <= overlap < 1
+     assert 0 < min_pad_coverage <= 1
+
+     if rate is not None:
+         samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+         n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+         frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+     else:
+         samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+         n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+         frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+
+     assert 0 < frame_step, "The rate is too high"
+     assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+         (sampling_rate / (samples_per_frame * partials_n_frames))
+
+     # Compute the slices
+     wav_slices, mel_slices = [], []
+     steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+     for i in range(0, steps, frame_step):
+         mel_range = np.array([i, i + partial_utterance_n_frames])
+         wav_range = mel_range * samples_per_frame
+         mel_slices.append(slice(*mel_range))
+         wav_slices.append(slice(*wav_range))
+
+     # Evaluate whether extra padding is warranted or not
+     last_wav_range = wav_slices[-1]
+     coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+     if coverage < min_pad_coverage and len(mel_slices) > 1:
+         mel_slices = mel_slices[:-1]
+         wav_slices = wav_slices[:-1]
+
+     return wav_slices, mel_slices
+
+
+ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+     """
+     Computes an embedding for a single utterance.
+
+     # TODO: handle multiple wavs to benefit from batching on GPU
+     :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+     :param using_partials: if True, then the utterance is split in partial utterances of
+     <partial_utterance_n_frames> frames and the utterance embedding is computed from their
+     normalized average. If False, the utterance is instead computed from feeding the entire
+     spectrogram to the network.
+     :param return_partials: if True, the partial embeddings will also be returned along with the
+     wav slices that correspond to the partial embeddings.
+     :param kwargs: additional arguments to compute_partial_splits()
+     :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+     <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+     (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+     returned. If <using_partials> is simultaneously set to False, both these values will be None
+     instead.
+     """
+     # Process the entire utterance if not using partials
+     if not using_partials:
+         frames = audio.wav_to_mel_spectrogram(wav)
+         embed = embed_frames_batch(frames[None, ...])[0]
+         if return_partials:
+             return embed, None, None
+         return embed
+
+     # Compute where to split the utterance into partials and pad if necessary
+     wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+     max_wave_length = wave_slices[-1].stop
+     if max_wave_length >= len(wav):
+         wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+     # Split the utterance into partials
+     frames = audio.wav_to_mel_spectrogram(wav)
+     frames_batch = np.array([frames[s] for s in mel_slices])
+     partial_embeds = embed_frames_batch(frames_batch)
+
+     # Compute the utterance embedding from the partial embeddings
+     raw_embed = np.mean(partial_embeds, axis=0)
+     embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+     if return_partials:
+         return embed, partial_embeds, wave_slices
+     return embed
+
+
+ def embed_speaker(wavs, **kwargs):
+     raise NotImplementedError()
+
+
+ def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+     if ax is None:
+         ax = plt.gca()
+
+     if shape is None:
+         height = int(np.sqrt(len(embed)))
+         shape = (height, -1)
+     embed = embed.reshape(shape)
+
+     cmap = cm.get_cmap()
+     mappable = ax.imshow(embed, cmap=cmap)
+     cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+     sm = cm.ScalarMappable(cmap=cmap)
+     sm.set_clim(*color_range)
+
+     ax.set_xticks([]), ax.set_yticks([])
+     ax.set_title(title)
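A short usage sketch combining `load_model`, `preprocess_wav` and `embed_utterance`; the checkpoint path matches the file added in this commit, while `speaker.wav` is a placeholder reference clip:

```
from pathlib import Path

import numpy as np

from encoder import inference as encoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
wav = encoder.preprocess_wav("speaker.wav")
embed, partial_embeds, wav_slices = encoder.embed_utterance(wav, return_partials=True)

print(embed.shape)            # (256,) with model_embedding_size = 256
print(np.linalg.norm(embed))  # ~1.0, the utterance embedding is L2-normalized
```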
encoder/model.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encoder.params_model import *
2
+ from encoder.params_data import *
3
+ from scipy.interpolate import interp1d
4
+ from sklearn.metrics import roc_curve
5
+ from torch.nn.utils import clip_grad_norm_
6
+ from scipy.optimize import brentq
7
+ from torch import nn
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class SpeakerEncoder(nn.Module):
13
+ def __init__(self, device, loss_device):
14
+ super().__init__()
15
+ self.loss_device = loss_device
16
+
17
+ # Network defition
18
+ self.lstm = nn.LSTM(input_size=mel_n_channels,
19
+ hidden_size=model_hidden_size,
20
+ num_layers=model_num_layers,
21
+ batch_first=True).to(device)
22
+ self.linear = nn.Linear(in_features=model_hidden_size,
23
+ out_features=model_embedding_size).to(device)
24
+ self.relu = torch.nn.ReLU().to(device)
25
+
26
+ # Cosine similarity scaling (with fixed initial parameter values)
27
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
+
30
+ # Loss
31
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
+
33
+ def do_gradient_ops(self):
34
+ # Gradient scale
35
+ self.similarity_weight.grad *= 0.01
36
+ self.similarity_bias.grad *= 0.01
37
+
38
+ # Gradient clipping
39
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
+
41
+ def forward(self, utterances, hidden_init=None):
42
+ """
43
+ Computes the embeddings of a batch of utterance spectrograms.
44
+
45
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46
+ (batch_size, n_frames, n_channels)
47
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
49
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50
+ """
51
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52
+ # and the final cell state.
53
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
+
55
+ # We take only the hidden state of the last layer
56
+ embeds_raw = self.relu(self.linear(hidden[-1]))
57
+
58
+ # L2-normalize it
59
+ embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
60
+
61
+ return embeds
62
+
63
+ def similarity_matrix(self, embeds):
64
+ """
65
+ Computes the similarity matrix according the section 2.1 of GE2E.
66
+
67
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68
+ utterances_per_speaker, embedding_size)
69
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
70
+ utterances_per_speaker, speakers_per_batch)
71
+ """
72
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
73
+
74
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
75
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
76
+ centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
77
+
78
+ # Exclusive centroids (1 per utterance)
79
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
80
+ centroids_excl /= (utterances_per_speaker - 1)
81
+ centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
82
+
83
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
84
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
85
+ # We vectorize the computation for efficiency.
86
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
87
+ speakers_per_batch).to(self.loss_device)
88
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
89
+ for j in range(speakers_per_batch):
90
+ mask = np.where(mask_matrix[j])[0]
91
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
92
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
93
+
94
+ ## Even more vectorized version (slower maybe because of transpose)
95
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
96
+ # ).to(self.loss_device)
97
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
98
+ # mask = np.where(1 - eye)
99
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100
+ # mask = np.where(eye)
101
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
103
+
104
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105
+ return sim_matrix
106
+
107
+ def loss(self, embeds):
108
+ """
109
+ Computes the softmax loss according to section 2.1 of GE2E.
110
+
111
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
112
+ utterances_per_speaker, embedding_size)
113
+ :return: the loss and the EER for this batch of embeddings.
114
+ """
115
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116
+
117
+ # Loss
118
+ sim_matrix = self.similarity_matrix(embeds)
119
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
120
+ speakers_per_batch))
121
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123
+ loss = self.loss_fn(sim_matrix, target)
124
+
125
+ # EER (not backpropagated)
126
+ with torch.no_grad():
127
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
128
+ labels = np.array([inv_argmax(i) for i in ground_truth])
129
+ preds = sim_matrix.detach().cpu().numpy()
130
+
131
+ # Snippet from https://yangcha.github.io/EER-ROC/
132
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
133
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134
+
135
+ return loss, eer
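A minimal sketch of how the forward pass and the GE2E loss above fit together, assuming the SpeakerEncoder class from this file, CPU devices, and a made-up batch of 4 speakers x 5 utterances (160 frames of 40 mel channels each, matching params_data):

    import torch
    from encoder.model import SpeakerEncoder

    device = loss_device = torch.device("cpu")
    model = SpeakerEncoder(device, loss_device)

    utterances = torch.rand(4 * 5, 160, 40)      # (batch, n_frames, n_channels)
    embeds = model(utterances)                   # (20, 256), L2-normalized embeddings
    embeds = embeds.view(4, 5, -1)               # (speakers_per_batch, utterances_per_speaker, emb_size)
    loss, eer = model.loss(embeds)               # GE2E softmax loss + equal error rate for the batch
    loss.backward()
    model.do_gradient_ops()                      # scale the similarity grads, clip the rest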
encoder/params_data.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Mel-filterbank
3
+ mel_window_length = 25 # In milliseconds
4
+ mel_window_step = 10 # In milliseconds
5
+ mel_n_channels = 40
6
+
7
+
8
+ ## Audio
9
+ sampling_rate = 16000
10
+ # Number of spectrogram frames in a partial utterance
11
+ partials_n_frames = 160 # 1600 ms
12
+ # Number of spectrogram frames at inference
13
+ inference_n_frames = 80 # 800 ms
14
+
15
+
16
+ ## Voice Activation Detection
17
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
18
+ # This sets the granularity of the VAD. Should not need to be changed.
19
+ vad_window_length = 30 # In milliseconds
20
+ # Number of frames to average together when performing the moving average smoothing.
21
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
22
+ vad_moving_average_width = 8
23
+ # Maximum number of consecutive silent frames a segment can have.
24
+ vad_max_silence_length = 6
25
+
26
+
27
+ ## Audio volume normalization
28
+ audio_norm_target_dBFS = -30
29
+
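Worked out, these settings mean a mel frame is produced every mel_window_step = 10 ms, so a training partial spans partials_n_frames * 10 ms = 1.6 s of audio and an inference window spans inference_n_frames * 10 ms = 0.8 s, each as a sequence of 40-channel mel vectors.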
encoder/params_model.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Model parameters
3
+ model_hidden_size = 256
4
+ model_embedding_size = 256
5
+ model_num_layers = 3
6
+
7
+
8
+ ## Training parameters
9
+ learning_rate_init = 1e-4
10
+ speakers_per_batch = 64
11
+ utterances_per_speaker = 10
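Worked out, one training batch therefore holds speakers_per_batch * utterances_per_speaker = 64 * 10 = 640 partial utterances, and the GE2E similarity matrix built in encoder/model.py has shape (speakers_per_batch, utterances_per_speaker, speakers_per_batch) = (64, 10, 64).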
encoder/preprocess.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from multiprocess.pool import ThreadPool
2
+ from encoder.params_data import *
3
+ from encoder.config import librispeech_datasets, anglophone_nationalites
4
+ from datetime import datetime
5
+ from encoder import audio
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ import numpy as np
9
+
10
+
11
+ class DatasetLog:
12
+ """
13
+ Registers metadata about the dataset in a text file.
14
+ """
15
+ def __init__(self, root, name):
16
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
17
+ self.sample_data = dict()
18
+
19
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
20
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
21
+ self.write_line("-----")
22
+ self._log_params()
23
+
24
+ def _log_params(self):
25
+ from encoder import params_data
26
+ self.write_line("Parameter values:")
27
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
28
+ value = getattr(params_data, param_name)
29
+ self.write_line("\t%s: %s" % (param_name, value))
30
+ self.write_line("-----")
31
+
32
+ def write_line(self, line):
33
+ self.text_file.write("%s\n" % line)
34
+
35
+ def add_sample(self, **kwargs):
36
+ for param_name, value in kwargs.items():
37
+ if param_name not in self.sample_data:
38
+ self.sample_data[param_name] = []
39
+ self.sample_data[param_name].append(value)
40
+
41
+ def finalize(self):
42
+ self.write_line("Statistics:")
43
+ for param_name, values in self.sample_data.items():
44
+ self.write_line("\t%s:" % param_name)
45
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
46
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
47
+ self.write_line("-----")
48
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
49
+ self.write_line("Finished on %s" % end_time)
50
+ self.text_file.close()
51
+
52
+
53
+ def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
54
+ dataset_root = datasets_root.joinpath(dataset_name)
55
+ if not dataset_root.exists():
56
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
57
+ return None, None
58
+ return dataset_root, DatasetLog(out_dir, dataset_name)
59
+
60
+
61
+ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
62
+ skip_existing, logger):
63
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
64
+
65
+ # Function to preprocess utterances for one speaker
66
+ def preprocess_speaker(speaker_dir: Path):
67
+ # Give a name to the speaker that includes its dataset
68
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
69
+
70
+ # Create an output directory with that name, as well as a txt file containing a
71
+ # reference to each source file.
72
+ speaker_out_dir = out_dir.joinpath(speaker_name)
73
+ speaker_out_dir.mkdir(exist_ok=True)
74
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
75
+
76
+ # The preprocessing may have been interrupted earlier; check whether a sources
77
+ # file already exists.
78
+ if sources_fpath.exists():
79
+ try:
80
+ with sources_fpath.open("r") as sources_file:
81
+ existing_fnames = {line.split(",")[0] for line in sources_file}
82
+ except:
83
+ existing_fnames = {}
84
+ else:
85
+ existing_fnames = {}
86
+
87
+ # Gather all audio files for that speaker recursively
88
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
89
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
90
+ # Check if the target output file already exists
91
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
92
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
93
+ if skip_existing and out_fname in existing_fnames:
94
+ continue
95
+
96
+ # Load and preprocess the waveform
97
+ wav = audio.preprocess_wav(in_fpath)
98
+ if len(wav) == 0:
99
+ continue
100
+
101
+ # Create the mel spectrogram, discard those that are too short
102
+ frames = audio.wav_to_mel_spectrogram(wav)
103
+ if len(frames) < partials_n_frames:
104
+ continue
105
+
106
+ out_fpath = speaker_out_dir.joinpath(out_fname)
107
+ np.save(out_fpath, frames)
108
+ logger.add_sample(duration=len(wav) / sampling_rate)
109
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
110
+
111
+ sources_file.close()
112
+
113
+ # Process the utterances for each speaker
114
+ with ThreadPool(8) as pool:
115
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
116
+ unit="speakers"))
117
+ logger.finalize()
118
+ print("Done preprocessing %s.\n" % dataset_name)
119
+
120
+ def preprocess_aidatatang_200zh(datasets_root: Path, out_dir: Path, skip_existing=False):
121
+ dataset_name = "aidatatang_200zh"
122
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
123
+ if not dataset_root:
124
+ return
125
+ # Preprocess all speakers
126
+ speaker_dirs = list(dataset_root.joinpath("corpus", "train").glob("*"))
127
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
128
+ skip_existing, logger)
129
+
130
+ def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
131
+ for dataset_name in librispeech_datasets["train"]["other"]:
132
+ # Initialize the preprocessing
133
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
134
+ if not dataset_root:
135
+ return
136
+
137
+ # Preprocess all speakers
138
+ speaker_dirs = list(dataset_root.glob("*"))
139
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
140
+ skip_existing, logger)
141
+
142
+
143
+ def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
144
+ # Initialize the preprocessing
145
+ dataset_name = "VoxCeleb1"
146
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
147
+ if not dataset_root:
148
+ return
149
+
150
+ # Get the contents of the meta file
151
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
152
+ metadata = [line.split("\t") for line in metafile][1:]
153
+
154
+ # Select the ID and the nationality, filter out non-anglophone speakers
155
+ nationalities = {line[0]: line[3] for line in metadata}
156
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
157
+ nationality.lower() in anglophone_nationalites]
158
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
159
+ (len(keep_speaker_ids), len(nationalities)))
160
+
161
+ # Get the speaker directories for anglophone speakers only
162
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
163
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
164
+ speaker_dir.name in keep_speaker_ids]
165
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
166
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
167
+
168
+ # Preprocess all speakers
169
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
170
+ skip_existing, logger)
171
+
172
+
173
+ def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
174
+ # Initialize the preprocessing
175
+ dataset_name = "VoxCeleb2"
176
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
177
+ if not dataset_root:
178
+ return
179
+
180
+ # Get the speaker directories
181
+ # Preprocess all speakers
182
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
183
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
184
+ skip_existing, logger)
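A minimal sketch (not part of the commit) of driving these preprocessing functions directly from Python; the dataset root is a placeholder and must contain aidatatang_200zh/corpus/train:

    from pathlib import Path
    from encoder.preprocess import preprocess_aidatatang_200zh

    datasets_root = Path("/data/datasets")                  # placeholder path
    out_dir = datasets_root.joinpath("SV2TTS", "encoder")
    out_dir.mkdir(parents=True, exist_ok=True)
    preprocess_aidatatang_200zh(datasets_root, out_dir, skip_existing=True)
    # out_dir now holds one folder per speaker, each with .npy mel frames and a _sources.txt index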
encoder/saved_models/pretrained.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57715adc6f36047166ab06e37b904240aee2f4d10fc88f78ed91510cf4b38666
3
+ size 17095158
encoder/train.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encoder.visualizations import Visualizations
2
+ from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
3
+ from encoder.params_model import *
4
+ from encoder.model import SpeakerEncoder
5
+ from utils.profiler import Profiler
6
+ from pathlib import Path
7
+ import torch
8
+
9
+ def sync(device: torch.device):
10
+ # For correct profiling (cuda operations are async)
11
+ if device.type == "cuda":
12
+ torch.cuda.synchronize(device)
13
+
14
+
15
+ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
16
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
17
+ no_visdom: bool):
18
+ # Create a dataset and a dataloader
19
+ dataset = SpeakerVerificationDataset(clean_data_root)
20
+ loader = SpeakerVerificationDataLoader(
21
+ dataset,
22
+ speakers_per_batch,
23
+ utterances_per_speaker,
24
+ num_workers=8,
25
+ )
26
+
27
+ # Setup the device on which to run the forward pass and the loss. These can be different,
28
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
29
+ # hyperparameters) faster on the CPU.
30
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
+ # FIXME: currently, the gradient is None if loss_device is cuda
32
+ loss_device = torch.device("cpu")
33
+
34
+ # Create the model and the optimizer
35
+ model = SpeakerEncoder(device, loss_device)
36
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
37
+ init_step = 1
38
+
39
+ # Configure file path for the model
40
+ state_fpath = models_dir.joinpath(run_id + ".pt")
41
+ backup_dir = models_dir.joinpath(run_id + "_backups")
42
+
43
+ # Load any existing model
44
+ if not force_restart:
45
+ if state_fpath.exists():
46
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
47
+ checkpoint = torch.load(state_fpath)
48
+ init_step = checkpoint["step"]
49
+ model.load_state_dict(checkpoint["model_state"])
50
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
51
+ optimizer.param_groups[0]["lr"] = learning_rate_init
52
+ else:
53
+ print("No model \"%s\" found, starting training from scratch." % run_id)
54
+ else:
55
+ print("Starting the training from scratch.")
56
+ model.train()
57
+
58
+ # Initialize the visualization environment
59
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
60
+ vis.log_dataset(dataset)
61
+ vis.log_params()
62
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
63
+ vis.log_implementation({"Device": device_name})
64
+
65
+ # Training loop
66
+ profiler = Profiler(summarize_every=10, disabled=False)
67
+ for step, speaker_batch in enumerate(loader, init_step):
68
+ profiler.tick("Blocking, waiting for batch (threaded)")
69
+
70
+ # Forward pass
71
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
72
+ sync(device)
73
+ profiler.tick("Data to %s" % device)
74
+ embeds = model(inputs)
75
+ sync(device)
76
+ profiler.tick("Forward pass")
77
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
78
+ loss, eer = model.loss(embeds_loss)
79
+ sync(loss_device)
80
+ profiler.tick("Loss")
81
+
82
+ # Backward pass
83
+ model.zero_grad()
84
+ loss.backward()
85
+ profiler.tick("Backward pass")
86
+ model.do_gradient_ops()
87
+ optimizer.step()
88
+ profiler.tick("Parameter update")
89
+
90
+ # Update visualizations
91
+ # learning_rate = optimizer.param_groups[0]["lr"]
92
+ vis.update(loss.item(), eer, step)
93
+
94
+ # Draw projections and save them to the backup folder
95
+ if umap_every != 0 and step % umap_every == 0:
96
+ print("Drawing and saving projections (step %d)" % step)
97
+ backup_dir.mkdir(exist_ok=True)
98
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
99
+ embeds = embeds.detach().cpu().numpy()
100
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
101
+ vis.save()
102
+
103
+ # Overwrite the latest version of the model
104
+ if save_every != 0 and step % save_every == 0:
105
+ print("Saving the model (step %d)" % step)
106
+ torch.save({
107
+ "step": step + 1,
108
+ "model_state": model.state_dict(),
109
+ "optimizer_state": optimizer.state_dict(),
110
+ }, state_fpath)
111
+
112
+ # Make a backup
113
+ if backup_every != 0 and step % backup_every == 0:
114
+ print("Making a backup (step %d)" % step)
115
+ backup_dir.mkdir(exist_ok=True)
116
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
117
+ torch.save({
118
+ "step": step + 1,
119
+ "model_state": model.state_dict(),
120
+ "optimizer_state": optimizer.state_dict(),
121
+ }, backup_fpath)
122
+
123
+ profiler.tick("Extras (visualizations, saving)")
encoder/visualizations.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2
+ from datetime import datetime
3
+ from time import perf_counter as timer
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ # import webbrowser
7
+ import visdom
8
+ import umap
9
+
10
+ colormap = np.array([
11
+ [76, 255, 0],
12
+ [0, 127, 70],
13
+ [255, 0, 0],
14
+ [255, 217, 38],
15
+ [0, 135, 255],
16
+ [165, 0, 165],
17
+ [255, 167, 255],
18
+ [0, 255, 255],
19
+ [255, 96, 38],
20
+ [142, 76, 0],
21
+ [33, 0, 127],
22
+ [0, 0, 0],
23
+ [183, 183, 183],
24
+ ], dtype=float) / 255
25
+
26
+
27
+ class Visualizations:
28
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
29
+ # Tracking data
30
+ self.last_update_timestamp = timer()
31
+ self.update_every = update_every
32
+ self.step_times = []
33
+ self.losses = []
34
+ self.eers = []
35
+ print("Updating the visualizations every %d steps." % update_every)
36
+
37
+ # If visdom is disabled TODO: use a better paradigm for that
38
+ self.disabled = disabled
39
+ if self.disabled:
40
+ return
41
+
42
+ # Set the environment name
43
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
44
+ if env_name is None:
45
+ self.env_name = now
46
+ else:
47
+ self.env_name = "%s (%s)" % (env_name, now)
48
+
49
+ # Connect to visdom and open the corresponding window in the browser
50
+ try:
51
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
52
+ except ConnectionError:
53
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
54
+ "start it.")
55
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
56
+
57
+ # Create the windows
58
+ self.loss_win = None
59
+ self.eer_win = None
60
+ # self.lr_win = None
61
+ self.implementation_win = None
62
+ self.projection_win = None
63
+ self.implementation_string = ""
64
+
65
+ def log_params(self):
66
+ if self.disabled:
67
+ return
68
+ from encoder import params_data
69
+ from encoder import params_model
70
+ param_string = "<b>Model parameters</b>:<br>"
71
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
72
+ value = getattr(params_model, param_name)
73
+ param_string += "\t%s: %s<br>" % (param_name, value)
74
+ param_string += "<b>Data parameters</b>:<br>"
75
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
76
+ value = getattr(params_data, param_name)
77
+ param_string += "\t%s: %s<br>" % (param_name, value)
78
+ self.vis.text(param_string, opts={"title": "Parameters"})
79
+
80
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
81
+ if self.disabled:
82
+ return
83
+ dataset_string = ""
84
+ dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
85
+ dataset_string += "\n" + dataset.get_logs()
86
+ dataset_string = dataset_string.replace("\n", "<br>")
87
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
88
+
89
+ def log_implementation(self, params):
90
+ if self.disabled:
91
+ return
92
+ implementation_string = ""
93
+ for param, value in params.items():
94
+ implementation_string += "<b>%s</b>: %s\n" % (param, value)
95
+ implementation_string = implementation_string.replace("\n", "<br>")
96
+ self.implementation_string = implementation_string
97
+ self.implementation_win = self.vis.text(
98
+ implementation_string,
99
+ opts={"title": "Training implementation"}
100
+ )
101
+
102
+ def update(self, loss, eer, step):
103
+ # Update the tracking data
104
+ now = timer()
105
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
106
+ self.last_update_timestamp = now
107
+ self.losses.append(loss)
108
+ self.eers.append(eer)
109
+ print(".", end="")
110
+
111
+ # Update the plots every <update_every> steps
112
+ if step % self.update_every != 0:
113
+ return
114
+ time_string = "Step time: mean: %5dms std: %5dms" % \
115
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
116
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
117
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
118
+ if not self.disabled:
119
+ self.loss_win = self.vis.line(
120
+ [np.mean(self.losses)],
121
+ [step],
122
+ win=self.loss_win,
123
+ update="append" if self.loss_win else None,
124
+ opts=dict(
125
+ legend=["Avg. loss"],
126
+ xlabel="Step",
127
+ ylabel="Loss",
128
+ title="Loss",
129
+ )
130
+ )
131
+ self.eer_win = self.vis.line(
132
+ [np.mean(self.eers)],
133
+ [step],
134
+ win=self.eer_win,
135
+ update="append" if self.eer_win else None,
136
+ opts=dict(
137
+ legend=["Avg. EER"],
138
+ xlabel="Step",
139
+ ylabel="EER",
140
+ title="Equal error rate"
141
+ )
142
+ )
143
+ if self.implementation_win is not None:
144
+ self.vis.text(
145
+ self.implementation_string + ("<b>%s</b>" % time_string),
146
+ win=self.implementation_win,
147
+ opts={"title": "Training implementation"},
148
+ )
149
+
150
+ # Reset the tracking
151
+ self.losses.clear()
152
+ self.eers.clear()
153
+ self.step_times.clear()
154
+
155
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
156
+ max_speakers=10):
157
+ max_speakers = min(max_speakers, len(colormap))
158
+ embeds = embeds[:max_speakers * utterances_per_speaker]
159
+
160
+ n_speakers = len(embeds) // utterances_per_speaker
161
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
162
+ colors = [colormap[i] for i in ground_truth]
163
+
164
+ reducer = umap.UMAP()
165
+ projected = reducer.fit_transform(embeds)
166
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
167
+ plt.gca().set_aspect("equal", "datalim")
168
+ plt.title("UMAP projection (step %d)" % step)
169
+ if not self.disabled:
170
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
171
+ if out_fpath is not None:
172
+ plt.savefig(out_fpath)
173
+ plt.clf()
174
+
175
+ def save(self):
176
+ if not self.disabled:
177
+ self.vis.save([self.env_name])
178
+
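A minimal sketch of using draw_projections without a running visdom server (disabled=True skips the connection); the embeddings are random placeholders and umap-learn plus matplotlib must be installed:

    import numpy as np
    from encoder.visualizations import Visualizations

    vis = Visualizations(env_name="offline", disabled=True)
    embeds = np.random.rand(4 * 10, 256).astype(np.float32)   # 4 fake speakers x 10 utterances
    vis.draw_projections(embeds, utterances_per_speaker=10, step=0, out_fpath="umap_step0.png")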
encoder_preprocess.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
2
+ from utils.argutils import print_args
3
+ from pathlib import Path
4
+ import argparse
5
+
6
+ if __name__ == "__main__":
7
+ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
8
+ pass
9
+
10
+ parser = argparse.ArgumentParser(
11
+ description="Preprocesses audio files from datasets, encodes them as mel spectrograms and "
12
+ "writes them to the disk. This will allow you to train the encoder. The "
13
+ "datasets required are at least one of LibriSpeech, VoxCeleb1, VoxCeleb2, aidatatang_200zh. ",
14
+ formatter_class=MyFormatter
15
+ )
16
+ parser.add_argument("datasets_root", type=Path, help=\
17
+ "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.")
18
+ parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
19
+ "Path to the output directory that will contain the mel spectrograms. If left out, "
20
+ "defaults to <datasets_root>/SV2TTS/encoder/")
21
+ parser.add_argument("-d", "--datasets", type=str,
22
+ default="librispeech_other,voxceleb1,aidatatang_200zh", help=\
23
+ "Comma-separated list of the name of the datasets you want to preprocess. Only the train "
24
+ "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, "
25
+ "voxceleb2.")
26
+ parser.add_argument("-s", "--skip_existing", action="store_true", help=\
27
+ "Whether to skip existing output files with the same name. Useful if this script was "
28
+ "interrupted.")
29
+ parser.add_argument("--no_trim", action="store_true", help=\
30
+ "Preprocess audio without trimming silences (not recommended).")
31
+ args = parser.parse_args()
32
+
33
+ # Verify webrtcvad is available
34
+ if not args.no_trim:
35
+ try:
36
+ import webrtcvad
37
+ except:
38
+ raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
39
+ "noise removal and is recommended. Please install and try again. If installation fails, "
40
+ "use --no_trim to disable this error message.")
41
+ del args.no_trim
42
+
43
+ # Process the arguments
44
+ args.datasets = args.datasets.split(",")
45
+ if not hasattr(args, "out_dir"):
46
+ args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder")
47
+ assert args.datasets_root.exists()
48
+ args.out_dir.mkdir(exist_ok=True, parents=True)
49
+
50
+ # Preprocess the datasets
51
+ print_args(args, parser)
52
+ preprocess_func = {
53
+ "librispeech_other": preprocess_librispeech,
54
+ "voxceleb1": preprocess_voxceleb1,
55
+ "voxceleb2": preprocess_voxceleb2,
56
+ "aidatatang_200zh": preprocess_aidatatang_200zh,
57
+ }
58
+ args = vars(args)
59
+ for dataset in args.pop("datasets"):
60
+ print("Preprocessing %s" % dataset)
61
+ preprocess_func[dataset](**args)
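As a usage example (paths are placeholders), preprocessing only the aidatatang_200zh train set located under /data/datasets, and skipping files produced by an earlier interrupted run, would look like: python encoder_preprocess.py /data/datasets -d aidatatang_200zh -s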
encoder_train.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.argutils import print_args
2
+ from encoder.train import train
3
+ from pathlib import Path
4
+ import argparse
5
+
6
+
7
+ if __name__ == "__main__":
8
+ parser = argparse.ArgumentParser(
9
+ description="Trains the speaker encoder. You must have run encoder_preprocess.py first.",
10
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
11
+ )
12
+
13
+ parser.add_argument("run_id", type=str, help= \
14
+ "Name for this model instance. If a model state from the same run ID was previously "
15
+ "saved, the training will restart from there. Pass -f to overwrite saved states and "
16
+ "restart from scratch.")
17
+ parser.add_argument("clean_data_root", type=Path, help= \
18
+ "Path to the output directory of encoder_preprocess.py. If you left the default "
19
+ "output directory when preprocessing, it should be <datasets_root>/SV2TTS/encoder/.")
20
+ parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\
21
+ "Path to the output directory that will contain the saved model weights, as well as "
22
+ "backups of those weights and plots generated during training.")
23
+ parser.add_argument("-v", "--vis_every", type=int, default=10, help= \
24
+ "Number of steps between updates of the loss and the plots.")
25
+ parser.add_argument("-u", "--umap_every", type=int, default=100, help= \
26
+ "Number of steps between updates of the umap projection. Set to 0 to never update the "
27
+ "projections.")
28
+ parser.add_argument("-s", "--save_every", type=int, default=500, help= \
29
+ "Number of steps between updates of the model on the disk. Set to 0 to never save the "
30
+ "model.")
31
+ parser.add_argument("-b", "--backup_every", type=int, default=7500, help= \
32
+ "Number of steps between backups of the model. Set to 0 to never make backups of the "
33
+ "model.")
34
+ parser.add_argument("-f", "--force_restart", action="store_true", help= \
35
+ "Do not load any saved model.")
36
+ parser.add_argument("--visdom_server", type=str, default="http://localhost")
37
+ parser.add_argument("--no_visdom", action="store_true", help= \
38
+ "Disable visdom.")
39
+ args = parser.parse_args()
40
+
41
+ # Process the arguments
42
+ args.models_dir.mkdir(exist_ok=True)
43
+
44
+ # Run the training
45
+ print_args(args, parser)
46
+ train(**vars(args))
47
+
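As a usage example (run name and path are placeholders): python encoder_train.py my_run /data/datasets/SV2TTS/encoder --no_visdom trains without a visdom server; drop --no_visdom and launch visdom beforehand to get live loss and EER plots.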
gen_voice.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from encoder.params_model import model_embedding_size as speaker_embedding_size
2
+ from utils.argutils import print_args
3
+ from utils.modelutils import check_model_paths
4
+ from synthesizer.inference import Synthesizer
5
+ from encoder import inference as encoder
6
+ from vocoder.wavernn import inference as rnn_vocoder
7
+ from vocoder.hifigan import inference as gan_vocoder
8
+ from pathlib import Path
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import librosa
12
+ import argparse
13
+ import torch
14
+ import sys
15
+ import os
16
+ import re
17
+ import cn2an
18
+ import glob
19
+
20
+ from audioread.exceptions import NoBackendError
21
+ vocoder = gan_vocoder
22
+
23
+ def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
24
+ embeds = [embed] * len(texts)
25
+ # If you know what the attention layer alignments are, you can retrieve them here by
26
+ # passing return_alignments=True
27
+ specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
28
+ #spec = specs[0]
29
+ breaks = [spec.shape[1] for spec in specs]
30
+ spec = np.concatenate(specs, axis=1)
31
+
32
+ # If seed is specified, reset torch seed and reload vocoder
33
+ # Synthesizing the waveform is fairly straightforward. Remember that the longer the
34
+ # spectrogram, the more time-efficient the vocoder.
35
+ generated_wav, output_sample_rate = vocoder.infer_waveform(spec)
36
+
37
+ # Add breaks
38
+ b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
39
+ b_starts = np.concatenate(([0], b_ends[:-1]))
40
+ wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
41
+ breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
42
+ generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
43
+
44
+ ## Post-generation
45
+ # There's a bug with sounddevice that makes the audio cut one second earlier, so we
46
+ # pad it.
47
+
48
+ # Trim excess silences to compensate for gaps in spectrograms (issue #53)
49
+ generated_wav = encoder.preprocess_wav(generated_wav)
50
+ generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97
51
+
52
+ # Save it on the disk
53
+ model=os.path.basename(in_fpath)
54
+ filename = "%s_%d_%s.wav" %(file_name, seq, model)
55
+ sf.write(filename, generated_wav, synthesizer.sample_rate)
56
+
57
+ print("\nSaved output as %s\n\n" % filename)
58
+
59
+
60
+ def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
61
+ if torch.cuda.is_available():
62
+ device_id = torch.cuda.current_device()
63
+ gpu_properties = torch.cuda.get_device_properties(device_id)
64
+ ## Print some environment information (for debugging purposes)
65
+ print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
66
+ "%.1fGb total memory.\n" %
67
+ (torch.cuda.device_count(),
68
+ device_id,
69
+ gpu_properties.name,
70
+ gpu_properties.major,
71
+ gpu_properties.minor,
72
+ gpu_properties.total_memory / 1e9))
73
+ else:
74
+ print("Using CPU for inference.\n")
75
+
76
+ print("Preparing the encoder, the synthesizer and the vocoder...")
77
+ encoder.load_model(enc_model_fpath)
78
+ synthesizer = Synthesizer(syn_model_fpath)
79
+ vocoder.load_model(voc_model_fpath)
80
+
81
+ encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
82
+ embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
83
+
84
+ texts = input_txt.split("\n")
85
+ seq=0
86
+ each_num=1500
87
+
88
+ punctuation = '!,。、,' # punctuate and split/clean text
89
+ processed_texts = []
90
+ cur_num = 0
91
+ for text in texts:
92
+ for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
93
+ if processed_text:
94
+ processed_texts.append(processed_text.strip())
95
+ cur_num += len(processed_text.strip())
96
+ if cur_num > each_num:
97
+ seq = seq +1
98
+ gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
99
+ processed_texts = []
100
+ cur_num = 0
101
+
102
+ if len(processed_texts)>0:
103
+ seq = seq +1
104
+ gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
105
+
106
+ if (len(sys.argv)>=3):
107
+ my_txt = ""
108
+ print("reading from :", sys.argv[1])
109
+ with open(sys.argv[1], "r") as f:
110
+ for line in f.readlines():
111
+ #line = line.strip('\n')
112
+ my_txt += line
113
+ txt_file_name = sys.argv[1]
114
+ wav_file_name = sys.argv[2]
115
+
116
+ output = cn2an.transform(my_txt, "an2cn")
117
+ print(output)
118
+ generate_wav(
119
+ Path("encoder/saved_models/pretrained.pt"),
120
+ Path("synthesizer/saved_models/mandarin.pt"),
121
+ Path("vocoder/saved_models/pretrained/g_hifigan.pt"), wav_file_name, output, txt_file_name
122
+ )
123
+
124
+ else:
125
+ print("please input the file name")
126
+ exit(1)
127
+
128
+
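The text chunking used above, in isolation with a made-up input string: sentences are cut on Chinese punctuation and then grouped into batches of roughly each_num characters before synthesis.

    import re

    punctuation = '!,。、,'
    text = "你好,欢迎使用语音克隆工具。祝使用愉快!"        # placeholder input
    pieces = [p.strip() for p in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n') if p]
    print(pieces)    # ['你好', '欢迎使用语音克隆工具', '祝使用愉快']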
mkgui/__init__.py ADDED
File without changes
mkgui/app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ import os
3
+ from pathlib import Path
4
+ from enum import Enum
5
+ from encoder import inference as encoder
6
+ import librosa
7
+ from scipy.io.wavfile import write
8
+ import re
9
+ import numpy as np
10
+ from mkgui.base.components.types import FileContent
11
+ from vocoder.hifigan import inference as gan_vocoder
12
+ from synthesizer.inference import Synthesizer
13
+ from typing import Any, Tuple
14
+ import matplotlib.pyplot as plt
15
+
16
+ # Constants
17
+ AUDIO_SAMPLES_DIR = f"samples{os.sep}"
18
+ SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
19
+ ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
20
+ VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
21
+ TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
22
+ TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
23
+ if not os.path.isdir("wavs"):
24
+ os.makedirs("wavs")
25
+
26
+ # Load local sample audio as options TODO: load dataset
27
+ if os.path.isdir(AUDIO_SAMPLES_DIR):
28
+ audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
29
+ # Pre-Load models
30
+ if os.path.isdir(SYN_MODELS_DIRT):
31
+ synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
32
+ print("Loaded synthesizer models: " + str(len(synthesizers)))
33
+ else:
34
+ raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
35
+
36
+ if os.path.isdir(ENC_MODELS_DIRT):
37
+ encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
38
+ print("Loaded encoders models: " + str(len(encoders)))
39
+ else:
40
+ raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
41
+
42
+ if os.path.isdir(VOC_MODELS_DIRT):
43
+ vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
44
+ print("Loaded vocoders models: " + str(len(synthesizers)))
45
+ else:
46
+ raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
47
+
48
+
49
+
50
+ class Input(BaseModel):
51
+ message: str = Field(
52
+ ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
53
+ )
54
+ local_audio_file: audio_input_selection = Field(
55
+ ..., alias="输入语音(本地wav)",
56
+ description="选择本地语音文件."
57
+ )
58
+ upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
59
+ description="拖拽或点击上传.", mime_type="audio/wav")
60
+ encoder: encoders = Field(
61
+ ..., alias="编码模型",
62
+ description="选择语音编码模型文件."
63
+ )
64
+ synthesizer: synthesizers = Field(
65
+ ..., alias="合成模型",
66
+ description="选择语音合成模型文件."
67
+ )
68
+ vocoder: vocoders = Field(
69
+ ..., alias="语音解码模型",
70
+ description="选择语音解码模型文件(目前只支持HifiGan类型)."
71
+ )
72
+
73
+ class AudioEntity(BaseModel):
74
+ content: bytes
75
+ mel: Any
76
+
77
+ class Output(BaseModel):
78
+ __root__: Tuple[AudioEntity, AudioEntity]
79
+
80
+ def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
81
+ """Custom output UI.
82
+ If this method is implemented, it will be used instead of the default Output UI renderer.
83
+ """
84
+ src, result = self.__root__
85
+
86
+ streamlit_app.subheader("Synthesized Audio")
87
+ streamlit_app.audio(result.content, format="audio/wav")
88
+
89
+ fig, ax = plt.subplots()
90
+ ax.imshow(src.mel, aspect="equal", interpolation="none")
91
+ ax.set_title("mel spectrogram(Source Audio)")
92
+ streamlit_app.pyplot(fig)
93
+ fig, ax = plt.subplots()
94
+ ax.imshow(result.mel, aspect="equal", interpolation="none")
95
+ ax.set_title("mel spectrogram(Result Audio)")
96
+ streamlit_app.pyplot(fig)
97
+
98
+
99
+ def synthesize(input: Input) -> Output:
100
+ """synthesize(合成)"""
101
+ # load models
102
+ encoder.load_model(Path(input.encoder.value))
103
+ current_synt = Synthesizer(Path(input.synthesizer.value))
104
+ gan_vocoder.load_model(Path(input.vocoder.value))
105
+
106
+ # load file
107
+ if input.upload_audio_file != None:
108
+ with open(TEMP_SOURCE_AUDIO, "w+b") as f:
109
+ f.write(input.upload_audio_file.as_bytes())
110
+ f.seek(0)
111
+ wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
112
+ else:
113
+ wav, sample_rate = librosa.load(input.local_audio_file.value)
114
+ write(TEMP_SOURCE_AUDIO, sample_rate, wav) #Make sure we get the correct wav
115
+
116
+ source_spec = Synthesizer.make_spectrogram(wav)
117
+
118
+ # preprocess
119
+ encoder_wav = encoder.preprocess_wav(wav, sample_rate)
120
+ embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
121
+
122
+ # Load input text
123
+ texts = filter(None, input.message.split("\n"))
124
+ punctuation = '!,。、,' # punctuate and split/clean text
125
+ processed_texts = []
126
+ for text in texts:
127
+ for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
128
+ if processed_text:
129
+ processed_texts.append(processed_text.strip())
130
+ texts = processed_texts
131
+
132
+ # synthesize and vocode
133
+ embeds = [embed] * len(texts)
134
+ specs = current_synt.synthesize_spectrograms(texts, embeds)
135
+ spec = np.concatenate(specs, axis=1)
136
+ sample_rate = Synthesizer.sample_rate
137
+ wav, sample_rate = gan_vocoder.infer_waveform(spec)
138
+
139
+ # write and output
140
+ write(TEMP_RESULT_AUDIO, sample_rate, wav) #Make sure we get the correct wav
141
+ with open(TEMP_SOURCE_AUDIO, "rb") as f:
142
+ source_file = f.read()
143
+ with open(TEMP_RESULT_AUDIO, "rb") as f:
144
+ result_file = f.read()
145
+ return Output(__root__=(AudioEntity(content=source_file, mel=source_spec), AudioEntity(content=result_file, mel=spec)))
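The Enum-from-glob pattern used by the model pickers above, in isolation (the file listing is a placeholder): each pydantic field typed with such an Enum is meant to become a selection widget in the generated UI, and .value hands the chosen Path to the loader.

    from enum import Enum
    from pathlib import Path

    files = [Path("synthesizer/saved_models/mandarin.pt")]          # placeholder listing
    synthesizers = Enum('synthesizers', [(f.name, f) for f in files])
    choice = synthesizers['mandarin.pt']
    print(choice.value)       # synthesizer/saved_models/mandarin.pt -- what Synthesizer(...) receives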
mkgui/app_vc.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from synthesizer.inference import Synthesizer
2
+ from pydantic import BaseModel, Field
3
+ from encoder import inference as speacker_encoder
4
+ import torch
5
+ import os
6
+ from pathlib import Path
7
+ from enum import Enum
8
+ import ppg_extractor as Extractor
9
+ import ppg2mel as Convertor
10
+ import librosa
11
+ from scipy.io.wavfile import write
12
+ import re
13
+ import numpy as np
14
+ from mkgui.base.components.types import FileContent
15
+ from vocoder.hifigan import inference as gan_vocoder
16
+ from typing import Any, Tuple
17
+ import matplotlib.pyplot as plt
18
+
19
+
20
+ # Constants
21
+ AUDIO_SAMPLES_DIR = f'sample{os.sep}'
22
+ EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
23
+ CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
24
+ VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
25
+ TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
26
+ TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
27
+ TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
28
+
29
+ # Load local sample audio as options TODO: load dataset
30
+ if os.path.isdir(AUDIO_SAMPLES_DIR):
31
+ audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
32
+ # Pre-Load models
33
+ if os.path.isdir(EXT_MODELS_DIRT):
34
+ extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
35
+ print("Loaded extractor models: " + str(len(extractors)))
36
+ else:
37
+ raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")
38
+
39
+ if os.path.isdir(CONV_MODELS_DIRT):
40
+ convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
41
+ print("Loaded convertor models: " + str(len(convertors)))
42
+ else:
43
+ raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")
44
+
45
+ if os.path.isdir(VOC_MODELS_DIRT):
46
+ vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
47
+ print("Loaded vocoders models: " + str(len(vocoders)))
48
+ else:
49
+ raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
50
+
51
+ class Input(BaseModel):
52
+ local_audio_file: audio_input_selection = Field(
53
+ ..., alias="输入语音(本地wav)",
54
+ description="选择本地语音文件."
55
+ )
56
+ upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
57
+ description="拖拽或点击上传.", mime_type="audio/wav")
58
+ local_audio_file_target: audio_input_selection = Field(
59
+ ..., alias="目标语音(本地wav)",
60
+ description="选择本地语音文件."
61
+ )
62
+ upload_audio_file_target: FileContent = Field(default=None, alias="或上传目标语音",
63
+ description="拖拽或点击上传.", mime_type="audio/wav")
64
+ extractor: extractors = Field(
65
+ ..., alias="编码模型",
66
+ description="选择语音编码模型文件."
67
+ )
68
+ convertor: convertors = Field(
69
+ ..., alias="转换模型",
70
+ description="选择语音转换模型文件."
71
+ )
72
+ vocoder: vocoders = Field(
73
+ ..., alias="语音解码模型",
74
+ description="选择语音解码模型文件(目前只支持HifiGan类型)."
75
+ )
76
+
77
+ class AudioEntity(BaseModel):
78
+ content: bytes
79
+ mel: Any
80
+
81
+ class Output(BaseModel):
82
+ __root__: Tuple[AudioEntity, AudioEntity, AudioEntity]
83
+
84
+ def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
85
+ """Custom output UI.
86
+ If this method is implemented, it will be used instead of the default Output UI renderer.
87
+ """
88
+ src, target, result = self.__root__
89
+
90
+ streamlit_app.subheader("Synthesized Audio")
91
+ streamlit_app.audio(result.content, format="audio/wav")
92
+
93
+ fig, ax = plt.subplots()
94
+ ax.imshow(src.mel, aspect="equal", interpolation="none")
95
+ ax.set_title("mel spectrogram(Source Audio)")
96
+ streamlit_app.pyplot(fig)
97
+ fig, ax = plt.subplots()
98
+ ax.imshow(target.mel, aspect="equal", interpolation="none")
99
+ ax.set_title("mel spectrogram(Target Audio)")
100
+ streamlit_app.pyplot(fig)
101
+ fig, ax = plt.subplots()
102
+ ax.imshow(result.mel, aspect="equal", interpolation="none")
103
+ ax.set_title("mel spectrogram(Result Audio)")
104
+ streamlit_app.pyplot(fig)
105
+
106
+ def convert(input: Input) -> Output:
107
+ """convert(转换)"""
108
+ # load models
109
+ extractor = Extractor.load_model(Path(input.extractor.value))
110
+ convertor = Convertor.load_model(Path(input.convertor.value))
111
+ # current_synt = Synthesizer(Path(input.synthesizer.value))
112
+ gan_vocoder.load_model(Path(input.vocoder.value))
113
+
114
+ # load file
115
+ if input.upload_audio_file != None:
116
+ with open(TEMP_SOURCE_AUDIO, "w+b") as f:
117
+ f.write(input.upload_audio_file.as_bytes())
118
+ f.seek(0)
119
+ src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
120
+ else:
121
+ src_wav, sample_rate = librosa.load(input.local_audio_file.value)
122
+ write(TEMP_SOURCE_AUDIO, sample_rate, src_wav) #Make sure we get the correct wav
123
+
124
+ if input.upload_audio_file_target != None:
125
+ with open(TEMP_TARGET_AUDIO, "w+b") as f:
126
+ f.write(input.upload_audio_file_target.as_bytes())
127
+ f.seek(0)
128
+ ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO)
129
+ else:
130
+ ref_wav, _ = librosa.load(input.local_audio_file_target.value)
131
+ write(TEMP_TARGET_AUDIO, sample_rate, ref_wav) #Make sure we get the correct wav
132
+
133
+ ppg = extractor.extract_from_wav(src_wav)
134
+ # Import the dependencies needed for voice conversion
135
+ from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
136
+ ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
137
+ speacker_encoder.load_model(Path("encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
138
+ embed = speacker_encoder.embed_utterance(ref_wav)
139
+ lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
140
+ min_len = min(ppg.shape[1], len(lf0_uv))
141
+ ppg = ppg[:, :min_len]
142
+ lf0_uv = lf0_uv[:min_len]
143
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
144
+ _, mel_pred, att_ws = convertor.inference(
145
+ ppg,
146
+ logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
147
+ spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
148
+ )
149
+ mel_pred= mel_pred.transpose(0, 1)
150
+ breaks = [mel_pred.shape[1]]
151
+ mel_pred= mel_pred.detach().cpu().numpy()
152
+
153
+ # synthesize and vocode
154
+ wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)
155
+
156
+ # write and output
157
+ write(TEMP_RESULT_AUDIO, sample_rate, wav) #Make sure we get the correct wav
158
+ with open(TEMP_SOURCE_AUDIO, "rb") as f:
159
+ source_file = f.read()
160
+ with open(TEMP_TARGET_AUDIO, "rb") as f:
161
+ target_file = f.read()
162
+ with open(TEMP_RESULT_AUDIO, "rb") as f:
163
+ result_file = f.read()
164
+
165
+
166
+ return Output(__root__=(AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)), AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)), AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav))))
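The frame-alignment step above in isolation, with dummy arrays (all shapes are placeholders): the extracted PPG sequence and the converted lf0/UV track are trimmed to a common length before convertor.inference is called.

    import numpy as np

    ppg = np.zeros((1, 523, 144))     # (batch, frames, ppg_dim) -- placeholder shape
    lf0_uv = np.zeros((520, 2))       # (frames, [lf0, uv])      -- placeholder shape
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg, lf0_uv = ppg[:, :min_len], lf0_uv[:min_len]
    assert ppg.shape[1] == len(lf0_uv) == 520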
mkgui/base/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ from .core import Opyrator
mkgui/base/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .fastapi_app import create_api
mkgui/base/api/fastapi_utils.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Collection of utilities for FastAPI apps."""
2
+
3
+ import inspect
4
+ from typing import Any, Type
5
+
6
+ from fastapi import FastAPI, Form
7
+ from pydantic import BaseModel
8
+
9
+
10
+ def as_form(cls: Type[BaseModel]) -> Any:
11
+ """Adds an as_form class method to decorated models.
12
+
13
+ The as_form class method can be used with FastAPI endpoints
14
+ """
15
+ new_params = [
16
+ inspect.Parameter(
17
+ field.alias,
18
+ inspect.Parameter.POSITIONAL_ONLY,
19
+ default=(Form(field.default) if not field.required else Form(...)),
20
+ )
21
+ for field in cls.__fields__.values()
22
+ ]
23
+
24
+ async def _as_form(**data): # type: ignore
25
+ return cls(**data)
26
+
27
+ sig = inspect.signature(_as_form)
28
+ sig = sig.replace(parameters=new_params)
29
+ _as_form.__signature__ = sig # type: ignore
30
+ setattr(cls, "as_form", _as_form)
31
+ return cls
32
+
33
+
34
+ def patch_fastapi(app: FastAPI) -> None:
35
+ """Patch function to allow relative url resolution.
36
+
37
+ This patch is required to make fastapi fully functional with a relative url path.
38
+ This code snippet can be copy-pasted to any Fastapi application.
39
+ """
40
+ from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html
41
+ from starlette.requests import Request
42
+ from starlette.responses import HTMLResponse
43
+
44
+ async def redoc_ui_html(req: Request) -> HTMLResponse:
45
+ assert app.openapi_url is not None
46
+ redoc_ui = get_redoc_html(
47
+ openapi_url="./" + app.openapi_url.lstrip("/"),
48
+ title=app.title + " - Redoc UI",
49
+ )
50
+
51
+ return HTMLResponse(redoc_ui.body.decode("utf-8"))
52
+
53
+ async def swagger_ui_html(req: Request) -> HTMLResponse:
54
+ assert app.openapi_url is not None
55
+ swagger_ui = get_swagger_ui_html(
56
+ openapi_url="./" + app.openapi_url.lstrip("/"),
57
+ title=app.title + " - Swagger UI",
58
+ oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
59
+ )
60
+
61
+ # Insert a request interceptor so that all requests run on a relative path
62
+ request_interceptor = (
63
+ "requestInterceptor: (e) => {"
64
+ "\n\t\t\tvar url = window.location.origin + window.location.pathname"
65
+ '\n\t\t\turl = url.substring( 0, url.lastIndexOf( "/" ) + 1);'
66
+ "\n\t\t\turl = e.url.replace(/http(s)?:\/\/[^/]*\//i, url);" # noqa: W605
67
+ "\n\t\t\te.contextUrl = url"
68
+ "\n\t\t\te.url = url"
69
+ "\n\t\t\treturn e;}"
70
+ )
71
+
72
+ return HTMLResponse(
73
+ swagger_ui.body.decode("utf-8").replace(
74
+ "dom_id: '#swagger-ui',",
75
+ "dom_id: '#swagger-ui',\n\t\t" + request_interceptor + ",",
76
+ )
77
+ )
78
+
79
+ # remove old docs route and add our patched route
80
+ routes_new = []
81
+ for app_route in app.routes:
82
+ if app_route.path == "/docs": # type: ignore
83
+ continue
84
+
85
+ if app_route.path == "/redoc": # type: ignore
86
+ continue
87
+
88
+ routes_new.append(app_route)
89
+
90
+ app.router.routes = routes_new
91
+
92
+ assert app.docs_url is not None
93
+ app.add_route(app.docs_url, swagger_ui_html, include_in_schema=False)
94
+ assert app.redoc_url is not None
95
+ app.add_route(app.redoc_url, redoc_ui_html, include_in_schema=False)
96
+
97
+ # Make graphql relative
98
+ from starlette import graphql
99
+
100
+ graphql.GRAPHIQL = graphql.GRAPHIQL.replace(
101
+ "({{REQUEST_PATH}}", '("." + {{REQUEST_PATH}}'
102
+ )
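A minimal sketch of the as_form decorator above on a toy model (the endpoint and fields are hypothetical): it adds an as_form constructor so a pydantic model can be filled from HTML form fields via Depends.

    from fastapi import Depends, FastAPI
    from pydantic import BaseModel

    from mkgui.base.api.fastapi_utils import as_form

    app = FastAPI()

    @as_form
    class LoginForm(BaseModel):
        username: str
        password: str

    @app.post("/login")
    async def login(form: LoginForm = Depends(LoginForm.as_form)):  # type: ignore
        return {"user": form.username}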
mkgui/base/components/__init__.py ADDED
File without changes
mkgui/base/components/outputs.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class ScoredLabel(BaseModel):
7
+ label: str
8
+ score: float
9
+
10
+
11
+ class ClassificationOutput(BaseModel):
12
+ __root__: List[ScoredLabel]
13
+
14
+ def __iter__(self): # type: ignore
15
+ return iter(self.__root__)
16
+
17
+ def __getitem__(self, item): # type: ignore
18
+ return self.__root__[item]
19
+
20
+ def render_output_ui(self, streamlit) -> None: # type: ignore
21
+ import plotly.express as px
22
+
23
+ sorted_predictions = sorted(
24
+ [prediction.dict() for prediction in self.__root__],
25
+ key=lambda k: k["score"],
26
+ )
27
+
28
+ num_labels = len(sorted_predictions)
29
+ if len(sorted_predictions) > 10:
30
+ num_labels = streamlit.slider(
31
+ "Maximum labels to show: ",
32
+ min_value=1,
33
+ max_value=len(sorted_predictions),
34
+ value=len(sorted_predictions),
35
+ )
36
+ fig = px.bar(
37
+ sorted_predictions[len(sorted_predictions) - num_labels :],
38
+ x="score",
39
+ y="label",
40
+ orientation="h",
41
+ )
42
+ streamlit.plotly_chart(fig, use_container_width=True)
43
+ # fig.show()
mkgui/base/components/types.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from typing import Any, Dict
3
+
4
+
5
+ class FileContent(str):
6
+ def as_bytes(self) -> bytes:
7
+ return base64.b64decode(self, validate=True)
8
+
9
+ def as_str(self) -> str:
10
+ return self.as_bytes().decode()
11
+
12
+ @classmethod
13
+ def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
14
+ field_schema.update(format="byte")
15
+
16
+ @classmethod
17
+ def __get_validators__(cls) -> Any: # type: ignore
18
+ yield cls.validate
19
+
20
+ @classmethod
21
+ def validate(cls, value: Any) -> "FileContent":
22
+ if isinstance(value, FileContent):
23
+ return value
24
+ elif isinstance(value, str):
25
+ return FileContent(value)
26
+ elif isinstance(value, (bytes, bytearray, memoryview)):
27
+ return FileContent(base64.b64encode(value).decode())
28
+ else:
29
+ raise Exception("Wrong type")
30
+
31
+ # # Not usable for now, because the browser offers no way to select a folder
32
+ # class DirectoryContent(FileContent):
33
+ # @classmethod
34
+ # def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
35
+ # field_schema.update(format="path")
36
+
37
+ # @classmethod
38
+ # def validate(cls, value: Any) -> "DirectoryContent":
39
+ # if isinstance(value, DirectoryContent):
40
+ # return value
41
+ # elif isinstance(value, str):
42
+ # return DirectoryContent(value)
43
+ # elif isinstance(value, (bytes, bytearray, memoryview)):
44
+ # return DirectoryContent(base64.b64encode(value).decode())
45
+ # else:
46
+ # raise Exception("Wrong type")
mkgui/base/core.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import inspect
3
+ import re
4
+ from typing import Any, Callable, Type, Union, get_type_hints
5
+
6
+ from pydantic import BaseModel, parse_raw_as
7
+ from pydantic.tools import parse_obj_as
8
+
9
+
10
+ def name_to_title(name: str) -> str:
11
+ """Converts a camelCase or snake_case name to title case."""
12
+ # If camelCase -> convert to snake case
13
+ name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
14
+ name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
15
+ # Convert to title case
16
+ return name.replace("_", " ").strip().title()
17
+
18
+
19
+ def is_compatible_type(type: Type) -> bool:
20
+ """Returns `True` if the type is opyrator-compatible."""
21
+ try:
22
+ if issubclass(type, BaseModel):
23
+ return True
24
+ except Exception:
25
+ pass
26
+
27
+ try:
28
+ # valid list type
29
+ if type.__origin__ is list and issubclass(type.__args__[0], BaseModel):
30
+ return True
31
+ except Exception:
32
+ pass
33
+
34
+ return False
35
+
36
+
37
+ def get_input_type(func: Callable) -> Type:
38
+ """Returns the input type of a given function (callable).
39
+
40
+ Args:
41
+ func: The function for which to get the input type.
42
+
43
+ Raises:
44
+ ValueError: If the function does not have a valid input type annotation.
45
+ """
46
+ type_hints = get_type_hints(func)
47
+
48
+ if "input" not in type_hints:
49
+ raise ValueError(
50
+ "The callable MUST have a parameter with the name `input` with typing annotation. "
51
+ "For example: `def my_opyrator(input: InputModel) -> OutputModel:`."
52
+ )
53
+
54
+ input_type = type_hints["input"]
55
+
56
+ if not is_compatible_type(input_type):
57
+ raise ValueError(
58
+ "The `input` parameter MUST be a subclass of the Pydantic BaseModel or a list of Pydantic models."
59
+ )
60
+
61
+ # TODO: return warning if more than one input parameters
62
+
63
+ return input_type
64
+
65
+
66
+ def get_output_type(func: Callable) -> Type:
67
+ """Returns the output type of a given function (callable).
68
+
69
+ Args:
70
+ func: The function for which to get the output type.
71
+
72
+ Raises:
73
+ ValueError: If the function does not have a valid output type annotation.
74
+ """
75
+ type_hints = get_type_hints(func)
76
+ if "return" not in type_hints:
77
+ raise ValueError(
78
+ "The return type of the callable MUST be annotated with type hints."
79
+ "For example: `def my_opyrator(input: InputModel) -> OutputModel:`."
80
+ )
81
+
82
+ output_type = type_hints["return"]
83
+
84
+ if not is_compatible_type(output_type):
85
+ raise ValueError(
86
+ "The return value MUST be a subclass of the Pydantic BaseModel or a list of Pydantic models."
87
+ )
88
+
89
+ return output_type
90
+
91
+
92
+ def get_callable(import_string: str) -> Callable:
93
+ """Import a callable from an string."""
94
+ callable_seperator = ":"
95
+ if callable_seperator not in import_string:
96
+ # Use dot as separator
97
+ callable_seperator = "."
98
+
99
+ if callable_seperator not in import_string:
100
+ raise ValueError("The callable path MUST specify the function. ")
101
+
102
+ mod_name, callable_name = import_string.rsplit(callable_seperator, 1)
103
+ mod = importlib.import_module(mod_name)
104
+ return getattr(mod, callable_name)
105
+
106
+
107
+ class Opyrator:
108
+ def __init__(self, func: Union[Callable, str]) -> None:
109
+ if isinstance(func, str):
110
+ # Try to load the function from a string notation
111
+ self.function = get_callable(func)
112
+ else:
113
+ self.function = func
114
+
115
+ self._action = "Execute"
116
+ self._input_type = None
117
+ self._output_type = None
118
+
119
+ if not callable(self.function):
120
+ raise ValueError("The provided function parameter is not a callable.")
121
+
122
+ if inspect.isclass(self.function):
123
+ raise ValueError(
124
+ "The provided callable is an uninitialized Class. This is not allowed."
125
+ )
126
+
127
+ if inspect.isfunction(self.function):
128
+ # The provided callable is a function
129
+ self._input_type = get_input_type(self.function)
130
+ self._output_type = get_output_type(self.function)
131
+
132
+ try:
133
+ # Get name
134
+ self._name = name_to_title(self.function.__name__)
135
+ except Exception:
136
+ pass
137
+
138
+ try:
139
+ # Get description from function
140
+ doc_string = inspect.getdoc(self.function)
141
+ if doc_string:
142
+ self._action = doc_string
143
+ except Exception:
144
+ pass
145
+ elif hasattr(self.function, "__call__"):
146
+ # The provided callable is a class instance that implements __call__
147
+ self._input_type = get_input_type(self.function.__call__) # type: ignore
148
+ self._output_type = get_output_type(self.function.__call__) # type: ignore
149
+
150
+ try:
151
+ # Get name
152
+ self._name = name_to_title(type(self.function).__name__)
153
+ except Exception:
154
+ pass
155
+
156
+ try:
157
+ # Get action from the __call__ docstring
158
+ doc_string = inspect.getdoc(self.function.__call__) # type: ignore
159
+ if doc_string:
160
+ self._action = doc_string
161
+
162
+ if (
163
+ not self._action
164
+ or self._action == "Call"
165
+ ):
166
+ # Get docstring from class instead of __call__ function
167
+ doc_string = inspect.getdoc(self.function)
168
+ if doc_string:
169
+ self._action = doc_string
170
+ except Exception:
171
+ pass
172
+ else:
173
+ raise ValueError("Unknown callable type.")
174
+
175
+ @property
176
+ def name(self) -> str:
177
+ return self._name
178
+
179
+ @property
180
+ def action(self) -> str:
181
+ return self._action
182
+
183
+ @property
184
+ def input_type(self) -> Any:
185
+ return self._input_type
186
+
187
+ @property
188
+ def output_type(self) -> Any:
189
+ return self._output_type
190
+
191
+ def __call__(self, input: Any, **kwargs: Any) -> Any:
192
+
193
+ input_obj = input
194
+
195
+ if isinstance(input, str):
196
+ # Allow json input
197
+ input_obj = parse_raw_as(self.input_type, input)
198
+
199
+ if isinstance(input, dict):
200
+ # Allow dict input
201
+ input_obj = parse_obj_as(self.input_type, input)
202
+
203
+ return self.function(input_obj, **kwargs)
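For orientation, here is a minimal usage sketch of the Opyrator wrapper defined above; the EchoInput/EchoOutput models and the echo function are hypothetical examples, not part of this repository:

# Hypothetical usage sketch for the Opyrator wrapper above.
from pydantic import BaseModel
from mkgui.base import Opyrator

class EchoInput(BaseModel):
    text: str

class EchoOutput(BaseModel):
    text: str

def echo(input: EchoInput) -> EchoOutput:
    """Echo the given text back."""
    return EchoOutput(text=input.text)

op = Opyrator(echo)                # or Opyrator("my_module:echo") to load from an import string
print(op.name, "-", op.action)     # Echo - Echo the given text back.
print(op('{"text": "hi"}'))        # JSON strings, dicts and model instances are all accepted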
mkgui/base/ui/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .streamlit_ui import render_streamlit_ui
mkgui/base/ui/schema_utils.py ADDED
@@ -0,0 +1,129 @@
1
+ from typing import Dict
2
+
3
+
4
+ def resolve_reference(reference: str, references: Dict) -> Dict:
5
+ return references[reference.split("/")[-1]]
6
+
7
+
8
+ def get_single_reference_item(property: Dict, references: Dict) -> Dict:
9
+ # Ref can either be directly in the properties or the first element of allOf
10
+ reference = property.get("$ref")
11
+ if reference is None:
12
+ reference = property["allOf"][0]["$ref"]
13
+ return resolve_reference(reference, references)
14
+
15
+
16
+ def is_single_string_property(property: Dict) -> bool:
17
+ return property.get("type") == "string"
18
+
19
+
20
+ def is_single_datetime_property(property: Dict) -> bool:
21
+ if property.get("type") != "string":
22
+ return False
23
+ return property.get("format") in ["date-time", "time", "date"]
24
+
25
+
26
+ def is_single_boolean_property(property: Dict) -> bool:
27
+ return property.get("type") == "boolean"
28
+
29
+
30
+ def is_single_number_property(property: Dict) -> bool:
31
+ return property.get("type") in ["integer", "number"]
32
+
33
+
34
+ def is_single_file_property(property: Dict) -> bool:
35
+ if property.get("type") != "string":
36
+ return False
37
+ # TODO: binary?
38
+ return property.get("format") == "byte"
39
+
40
+
41
+ def is_single_directory_property(property: Dict) -> bool:
42
+ if property.get("type") != "string":
43
+ return False
44
+ return property.get("format") == "path"
45
+
46
+ def is_multi_enum_property(property: Dict, references: Dict) -> bool:
47
+ if property.get("type") != "array":
48
+ return False
49
+
50
+ if property.get("uniqueItems") is not True:
51
+ # Only relevant if it is a set or other datastructures with unique items
52
+ return False
53
+
54
+ try:
55
+ _ = resolve_reference(property["items"]["$ref"], references)["enum"]
56
+ return True
57
+ except Exception:
58
+ return False
59
+
60
+
61
+ def is_single_enum_property(property: Dict, references: Dict) -> bool:
62
+ try:
63
+ _ = get_single_reference_item(property, references)["enum"]
64
+ return True
65
+ except Exception:
66
+ return False
67
+
68
+
69
+ def is_single_dict_property(property: Dict) -> bool:
70
+ if property.get("type") != "object":
71
+ return False
72
+ return "additionalProperties" in property
73
+
74
+
75
+ def is_single_reference(property: Dict) -> bool:
76
+ if property.get("type") is not None:
77
+ return False
78
+
79
+ return bool(property.get("$ref"))
80
+
81
+
82
+ def is_multi_file_property(property: Dict) -> bool:
83
+ if property.get("type") != "array":
84
+ return False
85
+
86
+ if property.get("items") is None:
87
+ return False
88
+
89
+ try:
90
+ # TODO: binary
91
+ return property["items"]["format"] == "byte"
92
+ except Exception:
93
+ return False
94
+
95
+
96
+ def is_single_object(property: Dict, references: Dict) -> bool:
97
+ try:
98
+ object_reference = get_single_reference_item(property, references)
99
+ if object_reference["type"] != "object":
100
+ return False
101
+ return "properties" in object_reference
102
+ except Exception:
103
+ return False
104
+
105
+
106
+ def is_property_list(property: Dict) -> bool:
107
+ if property.get("type") != "array":
108
+ return False
109
+
110
+ if property.get("items") is None:
111
+ return False
112
+
113
+ try:
114
+ return property["items"]["type"] in ["string", "number", "integer"]
115
+ except Exception:
116
+ return False
117
+
118
+
119
+ def is_object_list_property(property: Dict, references: Dict) -> bool:
120
+ if property.get("type") != "array":
121
+ return False
122
+
123
+ try:
124
+ object_reference = resolve_reference(property["items"]["$ref"], references)
125
+ if object_reference["type"] != "object":
126
+ return False
127
+ return "properties" in object_reference
128
+ except Exception:
129
+ return False
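As a rough illustration of what these predicates operate on, the sketch below (with a hypothetical Demo model) shows the JSON-schema property dicts they receive from pydantic:

# Hypothetical sketch: the JSON-schema fragments the helpers above inspect.
from pydantic import BaseModel
from mkgui.base.ui import schema_utils

class Demo(BaseModel):
    name: str
    count: int
    enabled: bool

props = Demo.schema(by_alias=True)["properties"]
print(props["name"])                                              # {'title': 'Name', 'type': 'string'}
print(schema_utils.is_single_string_property(props["name"]))      # True
print(schema_utils.is_single_number_property(props["count"]))     # True
print(schema_utils.is_single_boolean_property(props["enabled"]))  # True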
mkgui/base/ui/streamlit_ui.py ADDED
@@ -0,0 +1,888 @@
1
+ import datetime
2
+ import inspect
3
+ import mimetypes
4
+ import sys
5
+ from os import getcwd, unlink
6
+ from platform import system
7
+ from tempfile import NamedTemporaryFile
8
+ from typing import Any, Callable, Dict, List, Type
9
+ from PIL import Image
10
+
11
+ import pandas as pd
12
+ import streamlit as st
13
+ from fastapi.encoders import jsonable_encoder
14
+ from loguru import logger
15
+ from pydantic import BaseModel, ValidationError, parse_obj_as
16
+
17
+ from mkgui.base import Opyrator
18
+ from mkgui.base.core import name_to_title
19
+ from mkgui.base.ui import schema_utils
20
+ from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
21
+
22
+ STREAMLIT_RUNNER_SNIPPET = """
23
+ from mkgui.base.ui import render_streamlit_ui
24
+ from mkgui.base import Opyrator
25
+
26
+ import streamlit as st
27
+
28
+ # TODO: Make it configurable
29
+ # Page config can only be setup once
30
+ st.set_page_config(
31
+ page_title="MockingBird",
32
+ page_icon="🧊",
33
+ layout="wide")
34
+
35
+ render_streamlit_ui()
36
+ """
37
+
38
+ # with st.spinner("Loading MockingBird GUI. Please wait..."):
39
+ # opyrator = Opyrator("{opyrator_path}")
40
+
41
+
42
+ def launch_ui(port: int = 8501) -> None:
43
+ with NamedTemporaryFile(
44
+ suffix=".py", mode="w", encoding="utf-8", delete=False
45
+ ) as f:
46
+ f.write(STREAMLIT_RUNNER_SNIPPET)
47
+ f.seek(0)
48
+
49
+ import subprocess
50
+
51
+ python_path = f'PYTHONPATH="$PYTHONPATH:{getcwd()}"'
52
+ if system() == "Windows":
53
+ python_path = f"set PYTHONPATH=%PYTHONPATH%;{getcwd()} &&"
54
+ subprocess.run(
55
+ f"""set STREAMLIT_GLOBAL_SHOW_WARNING_ON_DIRECT_EXECUTION=false""",
56
+ shell=True,
57
+ )
58
+
59
+ subprocess.run(
60
+ f"""{python_path} "{sys.executable}" -m streamlit run --server.port={port} --server.headless=True --runner.magicEnabled=False --server.maxUploadSize=50 --browser.gatherUsageStats=False {f.name}""",
61
+ shell=True,
62
+ )
63
+
64
+ f.close()
65
+ unlink(f.name)
66
+
67
+
68
+ def function_has_named_arg(func: Callable, parameter: str) -> bool:
69
+ try:
70
+ sig = inspect.signature(func)
71
+ for param in sig.parameters.values():
72
+ if param.name == "input":
73
+ if param.name == parameter:
74
+ except Exception:
75
+ return False
76
+ return False
77
+
78
+
79
+ def has_output_ui_renderer(data_item: BaseModel) -> bool:
80
+ return hasattr(data_item, "render_output_ui")
81
+
82
+
83
+ def has_input_ui_renderer(input_class: Type[BaseModel]) -> bool:
84
+ return hasattr(input_class, "render_input_ui")
85
+
86
+
87
+ def is_compatible_audio(mime_type: str) -> bool:
88
+ return mime_type in ["audio/mpeg", "audio/ogg", "audio/wav"]
89
+
90
+
91
+ def is_compatible_image(mime_type: str) -> bool:
92
+ return mime_type in ["image/png", "image/jpeg"]
93
+
94
+
95
+ def is_compatible_video(mime_type: str) -> bool:
96
+ return mime_type in ["video/mp4"]
97
+
98
+
99
+ class InputUI:
100
+ def __init__(self, session_state, input_class: Type[BaseModel]):
101
+ self._session_state = session_state
102
+ self._input_class = input_class
103
+
104
+ self._schema_properties = input_class.schema(by_alias=True).get(
105
+ "properties", {}
106
+ )
107
+ self._schema_references = input_class.schema(by_alias=True).get(
108
+ "definitions", {}
109
+ )
110
+
111
+ def render_ui(self, streamlit_app_root) -> None:
112
+ if has_input_ui_renderer(self._input_class):
113
+ # The input model has a rendering function
114
+ # The rendering also returns the current state of input data
115
+ self._session_state.input_data = self._input_class.render_input_ui( # type: ignore
116
+ st, self._session_state.input_data
117
+ )
118
+ return
119
+
120
+ # print(self._schema_properties)
121
+ for property_key in self._schema_properties.keys():
122
+ property = self._schema_properties[property_key]
123
+
124
+ if not property.get("title"):
125
+ # Set property key as fallback title
126
+ property["title"] = name_to_title(property_key)
127
+
128
+ try:
129
+ if "input_data" in self._session_state:
130
+ self._store_value(
131
+ property_key,
132
+ self._render_property(streamlit_app_root, property_key, property),
133
+ )
134
+ except Exception as e:
135
+ print("Exception!", e)
136
+ pass
137
+
138
+ def _get_default_streamlit_input_kwargs(self, key: str, property: Dict) -> Dict:
139
+ streamlit_kwargs = {
140
+ "label": property.get("title"),
141
+ "key": key,
142
+ }
143
+
144
+ if property.get("description"):
145
+ streamlit_kwargs["help"] = property.get("description")
146
+ return streamlit_kwargs
147
+
148
+ def _store_value(self, key: str, value: Any) -> None:
149
+ data_element = self._session_state.input_data
150
+ key_elements = key.split(".")
151
+ for i, key_element in enumerate(key_elements):
152
+ if i == len(key_elements) - 1:
153
+ # add value to this element
154
+ data_element[key_element] = value
155
+ return
156
+ if key_element not in data_element:
157
+ data_element[key_element] = {}
158
+ data_element = data_element[key_element]
159
+
160
+ def _get_value(self, key: str) -> Any:
161
+ data_element = self._session_state.input_data
162
+ key_elements = key.split(".")
163
+ for i, key_element in enumerate(key_elements):
164
+ if i == len(key_elements) - 1:
165
+ # add value to this element
166
+ if key_element not in data_element:
167
+ return None
168
+ return data_element[key_element]
169
+ if key_element not in data_element:
170
+ data_element[key_element] = {}
171
+ data_element = data_element[key_element]
172
+ return None
173
+
174
+ def _render_single_datetime_input(
175
+ self, streamlit_app: st, key: str, property: Dict
176
+ ) -> Any:
177
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
178
+
179
+ if property.get("format") == "time":
180
+ if property.get("default"):
181
+ try:
182
+ streamlit_kwargs["value"] = datetime.time.fromisoformat( # type: ignore
183
+ property.get("default")
184
+ )
185
+ except Exception:
186
+ pass
187
+ return streamlit_app.time_input(**streamlit_kwargs)
188
+ elif property.get("format") == "date":
189
+ if property.get("default"):
190
+ try:
191
+ streamlit_kwargs["value"] = datetime.date.fromisoformat( # type: ignore
192
+ property.get("default")
193
+ )
194
+ except Exception:
195
+ pass
196
+ return streamlit_app.date_input(**streamlit_kwargs)
197
+ elif property.get("format") == "date-time":
198
+ if property.get("default"):
199
+ try:
200
+ streamlit_kwargs["value"] = datetime.datetime.fromisoformat( # type: ignore
201
+ property.get("default")
202
+ )
203
+ except Exception:
204
+ pass
205
+ with streamlit_app.container():
206
+ streamlit_app.subheader(streamlit_kwargs.get("label"))
207
+ if streamlit_kwargs.get("help"):
208
+ streamlit_app.text(streamlit_kwargs.get("help"))
209
+ selected_date = None
210
+ selected_time = None
211
+ date_col, time_col = streamlit_app.columns(2)
212
+ with date_col:
213
+ date_kwargs = {"label": "Date", "key": key + "-date-input"}
214
+ if streamlit_kwargs.get("value"):
215
+ try:
216
+ date_kwargs["value"] = streamlit_kwargs.get( # type: ignore
217
+ "value"
218
+ ).date()
219
+ except Exception:
220
+ pass
221
+ selected_date = streamlit_app.date_input(**date_kwargs)
222
+
223
+ with time_col:
224
+ time_kwargs = {"label": "Time", "key": key + "-time-input"}
225
+ if streamlit_kwargs.get("value"):
226
+ try:
227
+ time_kwargs["value"] = streamlit_kwargs.get( # type: ignore
228
+ "value"
229
+ ).time()
230
+ except Exception:
231
+ pass
232
+ selected_time = streamlit_app.time_input(**time_kwargs)
233
+ return datetime.datetime.combine(selected_date, selected_time)
234
+ else:
235
+ streamlit_app.warning(
236
+ "Date format is not supported: " + str(property.get("format"))
237
+ )
238
+
239
+ def _render_single_file_input(
240
+ self, streamlit_app: st, key: str, property: Dict
241
+ ) -> Any:
242
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
243
+ file_extension = None
244
+ if "mime_type" in property:
245
+ file_extension = mimetypes.guess_extension(property["mime_type"])
246
+
247
+ uploaded_file = streamlit_app.file_uploader(
248
+ **streamlit_kwargs, accept_multiple_files=False, type=file_extension
249
+ )
250
+ if uploaded_file is None:
251
+ return None
252
+
253
+ bytes = uploaded_file.getvalue()
254
+ if property.get("mime_type"):
255
+ if is_compatible_audio(property["mime_type"]):
256
+ # Show audio
257
+ streamlit_app.audio(bytes, format=property.get("mime_type"))
258
+ if is_compatible_image(property["mime_type"]):
259
+ # Show image
260
+ streamlit_app.image(bytes)
261
+ if is_compatible_video(property["mime_type"]):
262
+ # Show video
263
+ streamlit_app.video(bytes, format=property.get("mime_type"))
264
+ return bytes
265
+
266
+ def _render_single_string_input(
267
+ self, streamlit_app: st, key: str, property: Dict
268
+ ) -> Any:
269
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
270
+
271
+ if property.get("default"):
272
+ streamlit_kwargs["value"] = property.get("default")
273
+ elif property.get("example"):
274
+ # TODO: also use example for other property types
275
+ # Use example as value if it is provided
276
+ streamlit_kwargs["value"] = property.get("example")
277
+
278
+ if property.get("maxLength") is not None:
279
+ streamlit_kwargs["max_chars"] = property.get("maxLength")
280
+
281
+ if (
282
+ property.get("format")
283
+ or (
284
+ property.get("maxLength") is not None
285
+ and int(property.get("maxLength")) < 140 # type: ignore
286
+ )
287
+ or property.get("writeOnly")
288
+ ):
289
+ # If any format is set, use single text input
290
+ # If max chars is set to less than 140, use single text input
291
+ # If write only -> password field
292
+ if property.get("writeOnly"):
293
+ streamlit_kwargs["type"] = "password"
294
+ return streamlit_app.text_input(**streamlit_kwargs)
295
+ else:
296
+ # Otherwise use multiline text area
297
+ return streamlit_app.text_area(**streamlit_kwargs)
298
+
299
+ def _render_multi_enum_input(
300
+ self, streamlit_app: st, key: str, property: Dict
301
+ ) -> Any:
302
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
303
+ reference_item = schema_utils.resolve_reference(
304
+ property["items"]["$ref"], self._schema_references
305
+ )
306
+ # TODO: how to select defaults
307
+ return streamlit_app.multiselect(
308
+ **streamlit_kwargs, options=reference_item["enum"]
309
+ )
310
+
311
+ def _render_single_enum_input(
312
+ self, streamlit_app: st, key: str, property: Dict
313
+ ) -> Any:
314
+
315
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
316
+ reference_item = schema_utils.get_single_reference_item(
317
+ property, self._schema_references
318
+ )
319
+
320
+ if property.get("default") is not None:
321
+ try:
322
+ streamlit_kwargs["index"] = reference_item["enum"].index(
323
+ property.get("default")
324
+ )
325
+ except Exception:
326
+ # Use default selection
327
+ pass
328
+
329
+ return streamlit_app.selectbox(
330
+ **streamlit_kwargs, options=reference_item["enum"]
331
+ )
332
+
333
+ def _render_single_dict_input(
334
+ self, streamlit_app: st, key: str, property: Dict
335
+ ) -> Any:
336
+
337
+ # Add title and subheader
338
+ streamlit_app.subheader(property.get("title"))
339
+ if property.get("description"):
340
+ streamlit_app.markdown(property.get("description"))
341
+
342
+ streamlit_app.markdown("---")
343
+
344
+ current_dict = self._get_value(key)
345
+ if not current_dict:
346
+ current_dict = {}
347
+
348
+ key_col, value_col = streamlit_app.columns(2)
349
+
350
+ with key_col:
351
+ updated_key = streamlit_app.text_input(
352
+ "Key", value="", key=key + "-new-key"
353
+ )
354
+
355
+ with value_col:
356
+ # TODO: also add boolean?
357
+ value_kwargs = {"label": "Value", "key": key + "-new-value"}
358
+ if property["additionalProperties"].get("type") == "integer":
359
+ value_kwargs["value"] = 0 # type: ignore
360
+ updated_value = streamlit_app.number_input(**value_kwargs)
361
+ elif property["additionalProperties"].get("type") == "number":
362
+ value_kwargs["value"] = 0.0 # type: ignore
363
+ value_kwargs["format"] = "%f"
364
+ updated_value = streamlit_app.number_input(**value_kwargs)
365
+ else:
366
+ value_kwargs["value"] = ""
367
+ updated_value = streamlit_app.text_input(**value_kwargs)
368
+
369
+ streamlit_app.markdown("---")
370
+
371
+ with streamlit_app.container():
372
+ clear_col, add_col = streamlit_app.columns([1, 2])
373
+
374
+ with clear_col:
375
+ if streamlit_app.button("Clear Items", key=key + "-clear-items"):
376
+ current_dict = {}
377
+
378
+ with add_col:
379
+ if (
380
+ streamlit_app.button("Add Item", key=key + "-add-item")
381
+ and updated_key
382
+ ):
383
+ current_dict[updated_key] = updated_value
384
+
385
+ streamlit_app.write(current_dict)
386
+
387
+ return current_dict
388
+
389
+ def _render_single_reference(
390
+ self, streamlit_app: st, key: str, property: Dict
391
+ ) -> Any:
392
+ reference_item = schema_utils.get_single_reference_item(
393
+ property, self._schema_references
394
+ )
395
+ return self._render_property(streamlit_app, key, reference_item)
396
+
397
+ def _render_multi_file_input(
398
+ self, streamlit_app: st, key: str, property: Dict
399
+ ) -> Any:
400
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
401
+
402
+ file_extension = None
403
+ if "mime_type" in property:
404
+ file_extension = mimetypes.guess_extension(property["mime_type"])
405
+
406
+ uploaded_files = streamlit_app.file_uploader(
407
+ **streamlit_kwargs, accept_multiple_files=True, type=file_extension
408
+ )
409
+ uploaded_files_bytes = []
410
+ if uploaded_files:
411
+ for uploaded_file in uploaded_files:
412
+ uploaded_files_bytes.append(uploaded_file.read())
413
+ return uploaded_files_bytes
414
+
415
+ def _render_single_boolean_input(
416
+ self, streamlit_app: st, key: str, property: Dict
417
+ ) -> Any:
418
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
419
+
420
+ if property.get("default"):
421
+ streamlit_kwargs["value"] = property.get("default")
422
+ return streamlit_app.checkbox(**streamlit_kwargs)
423
+
424
+ def _render_single_number_input(
425
+ self, streamlit_app: st, key: str, property: Dict
426
+ ) -> Any:
427
+ streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
428
+
429
+ number_transform = int
430
+ if property.get("type") == "number":
431
+ number_transform = float # type: ignore
432
+ streamlit_kwargs["format"] = "%f"
433
+
434
+ if "multipleOf" in property:
435
+ # Set stepcount based on multiple of parameter
436
+ streamlit_kwargs["step"] = number_transform(property["multipleOf"])
437
+ elif number_transform == int:
438
+ # Set step size to 1 as default
439
+ streamlit_kwargs["step"] = 1
440
+ elif number_transform == float:
441
+ # Set step size to 0.01 as default
442
+ # TODO: adapt to default value
443
+ streamlit_kwargs["step"] = 0.01
444
+
445
+ if "minimum" in property:
446
+ streamlit_kwargs["min_value"] = number_transform(property["minimum"])
447
+ if "exclusiveMinimum" in property:
448
+ streamlit_kwargs["min_value"] = number_transform(
449
+ property["exclusiveMinimum"] + streamlit_kwargs["step"]
450
+ )
451
+ if "maximum" in property:
452
+ streamlit_kwargs["max_value"] = number_transform(property["maximum"])
453
+
454
+ if "exclusiveMaximum" in property:
455
+ streamlit_kwargs["max_value"] = number_transform(
456
+ property["exclusiveMaximum"] - streamlit_kwargs["step"]
457
+ )
458
+
459
+ if property.get("default") is not None:
460
+ streamlit_kwargs["value"] = number_transform(property.get("default")) # type: ignore
461
+ else:
462
+ if "min_value" in streamlit_kwargs:
463
+ streamlit_kwargs["value"] = streamlit_kwargs["min_value"]
464
+ elif number_transform == int:
465
+ streamlit_kwargs["value"] = 0
466
+ else:
467
+ # Set default value to step
468
+ streamlit_kwargs["value"] = number_transform(streamlit_kwargs["step"])
469
+
470
+ if "min_value" in streamlit_kwargs and "max_value" in streamlit_kwargs:
471
+ # TODO: Only if less than X steps
472
+ return streamlit_app.slider(**streamlit_kwargs)
473
+ else:
474
+ return streamlit_app.number_input(**streamlit_kwargs)
475
+
476
+ def _render_object_input(self, streamlit_app: st, key: str, property: Dict) -> Any:
477
+ properties = property["properties"]
478
+ object_inputs = {}
479
+ for property_key in properties:
480
+ property = properties[property_key]
481
+ if not property.get("title"):
482
+ # Set property key as fallback title
483
+ property["title"] = name_to_title(property_key)
484
+ # construct full key based on key parts -> required later to get the value
485
+ full_key = key + "." + property_key
486
+ object_inputs[property_key] = self._render_property(
487
+ streamlit_app, full_key, property
488
+ )
489
+ return object_inputs
490
+
491
+ def _render_single_object_input(
492
+ self, streamlit_app: st, key: str, property: Dict
493
+ ) -> Any:
494
+ # Add title and subheader
495
+ title = property.get("title")
496
+ streamlit_app.subheader(title)
497
+ if property.get("description"):
498
+ streamlit_app.markdown(property.get("description"))
499
+
500
+ object_reference = schema_utils.get_single_reference_item(
501
+ property, self._schema_references
502
+ )
503
+ return self._render_object_input(streamlit_app, key, object_reference)
504
+
505
+ def _render_property_list_input(
506
+ self, streamlit_app: st, key: str, property: Dict
507
+ ) -> Any:
508
+
509
+ # Add title and subheader
510
+ streamlit_app.subheader(property.get("title"))
511
+ if property.get("description"):
512
+ streamlit_app.markdown(property.get("description"))
513
+
514
+ streamlit_app.markdown("---")
515
+
516
+ current_list = self._get_value(key)
517
+ if not current_list:
518
+ current_list = []
519
+
520
+ value_kwargs = {"label": "Value", "key": key + "-new-value"}
521
+ if property["items"]["type"] == "integer":
522
+ value_kwargs["value"] = 0 # type: ignore
523
+ new_value = streamlit_app.number_input(**value_kwargs)
524
+ elif property["items"]["type"] == "number":
525
+ value_kwargs["value"] = 0.0 # type: ignore
526
+ value_kwargs["format"] = "%f"
527
+ new_value = streamlit_app.number_input(**value_kwargs)
528
+ else:
529
+ value_kwargs["value"] = ""
530
+ new_value = streamlit_app.text_input(**value_kwargs)
531
+
532
+ streamlit_app.markdown("---")
533
+
534
+ with streamlit_app.container():
535
+ clear_col, add_col = streamlit_app.columns([1, 2])
536
+
537
+ with clear_col:
538
+ if streamlit_app.button("Clear Items", key=key + "-clear-items"):
539
+ current_list = []
540
+
541
+ with add_col:
542
+ if (
543
+ streamlit_app.button("Add Item", key=key + "-add-item")
544
+ and new_value is not None
545
+ ):
546
+ current_list.append(new_value)
547
+
548
+ streamlit_app.write(current_list)
549
+
550
+ return current_list
551
+
552
+ def _render_object_list_input(
553
+ self, streamlit_app: st, key: str, property: Dict
554
+ ) -> Any:
555
+
556
+ # TODO: support max_items, and min_items properties
557
+
558
+ # Add title and subheader
559
+ streamlit_app.subheader(property.get("title"))
560
+ if property.get("description"):
561
+ streamlit_app.markdown(property.get("description"))
562
+
563
+ streamlit_app.markdown("---")
564
+
565
+ current_list = self._get_value(key)
566
+ if not current_list:
567
+ current_list = []
568
+
569
+ object_reference = schema_utils.resolve_reference(
570
+ property["items"]["$ref"], self._schema_references
571
+ )
572
+ input_data = self._render_object_input(streamlit_app, key, object_reference)
573
+
574
+ streamlit_app.markdown("---")
575
+
576
+ with streamlit_app.container():
577
+ clear_col, add_col = streamlit_app.columns([1, 2])
578
+
579
+ with clear_col:
580
+ if streamlit_app.button("Clear Items", key=key + "-clear-items"):
581
+ current_list = []
582
+
583
+ with add_col:
584
+ if (
585
+ streamlit_app.button("Add Item", key=key + "-add-item")
586
+ and input_data
587
+ ):
588
+ current_list.append(input_data)
589
+
590
+ streamlit_app.write(current_list)
591
+ return current_list
592
+
593
+ def _render_property(self, streamlit_app: st, key: str, property: Dict) -> Any:
594
+ if schema_utils.is_single_enum_property(property, self._schema_references):
595
+ return self._render_single_enum_input(streamlit_app, key, property)
596
+
597
+ if schema_utils.is_multi_enum_property(property, self._schema_references):
598
+ return self._render_multi_enum_input(streamlit_app, key, property)
599
+
600
+ if schema_utils.is_single_file_property(property):
601
+ return self._render_single_file_input(streamlit_app, key, property)
602
+
603
+ if schema_utils.is_multi_file_property(property):
604
+ return self._render_multi_file_input(streamlit_app, key, property)
605
+
606
+ if schema_utils.is_single_datetime_property(property):
607
+ return self._render_single_datetime_input(streamlit_app, key, property)
608
+
609
+ if schema_utils.is_single_boolean_property(property):
610
+ return self._render_single_boolean_input(streamlit_app, key, property)
611
+
612
+ if schema_utils.is_single_dict_property(property):
613
+ return self._render_single_dict_input(streamlit_app, key, property)
614
+
615
+ if schema_utils.is_single_number_property(property):
616
+ return self._render_single_number_input(streamlit_app, key, property)
617
+
618
+ if schema_utils.is_single_string_property(property):
619
+ return self._render_single_string_input(streamlit_app, key, property)
620
+
621
+ if schema_utils.is_single_object(property, self._schema_references):
622
+ return self._render_single_object_input(streamlit_app, key, property)
623
+
624
+ if schema_utils.is_object_list_property(property, self._schema_references):
625
+ return self._render_object_list_input(streamlit_app, key, property)
626
+
627
+ if schema_utils.is_property_list(property):
628
+ return self._render_property_list_input(streamlit_app, key, property)
629
+
630
+ if schema_utils.is_single_reference(property):
631
+ return self._render_single_reference(streamlit_app, key, property)
632
+
633
+ streamlit_app.warning(
634
+ "The type of the following property is currently not supported: "
635
+ + str(property.get("title"))
636
+ )
637
+ raise Exception("Unsupported property")
638
+
639
+
640
+ class OutputUI:
641
+ def __init__(self, output_data: Any, input_data: Any):
642
+ self._output_data = output_data
643
+ self._input_data = input_data
644
+
645
+ def render_ui(self, streamlit_app) -> None:
646
+ try:
647
+ if isinstance(self._output_data, BaseModel):
648
+ self._render_single_output(streamlit_app, self._output_data)
649
+ return
650
+ if type(self._output_data) == list:
651
+ self._render_list_output(streamlit_app, self._output_data)
652
+ return
653
+ except Exception as ex:
654
+ streamlit_app.exception(ex)
655
+ # Fallback to JSON output
656
+ streamlit_app.json(jsonable_encoder(self._output_data))
657
+
658
+ def _render_single_text_property(
659
+ self, streamlit: st, property_schema: Dict, value: Any
660
+ ) -> None:
661
+ # Add title and subheader
662
+ streamlit.subheader(property_schema.get("title"))
663
+ if property_schema.get("description"):
664
+ streamlit.markdown(property_schema.get("description"))
665
+ if value is None or value == "":
666
+ streamlit.info("No value returned!")
667
+ else:
668
+ streamlit.code(str(value), language="plain")
669
+
670
+ def _render_single_file_property(
671
+ self, streamlit: st, property_schema: Dict, value: Any
672
+ ) -> None:
673
+ # Add title and subheader
674
+ streamlit.subheader(property_schema.get("title"))
675
+ if property_schema.get("description"):
676
+ streamlit.markdown(property_schema.get("description"))
677
+ if value is None or value == "":
678
+ streamlit.info("No value returned!")
679
+ else:
680
+ # TODO: Detect if it is a FileContent instance
681
+ # TODO: detect if it is base64
682
+ file_extension = ""
683
+ if "mime_type" in property_schema:
684
+ mime_type = property_schema["mime_type"]
685
+ file_extension = mimetypes.guess_extension(mime_type) or ""
686
+
687
+ if is_compatible_audio(mime_type):
688
+ streamlit.audio(value.as_bytes(), format=mime_type)
689
+ return
690
+
691
+ if is_compatible_image(mime_type):
692
+ streamlit.image(value.as_bytes())
693
+ return
694
+
695
+ if is_compatible_video(mime_type):
696
+ streamlit.video(value.as_bytes(), format=mime_type)
697
+ return
698
+
699
+ filename = (
700
+ (property_schema["title"] + file_extension)
701
+ .lower()
702
+ .strip()
703
+ .replace(" ", "-")
704
+ )
705
+ streamlit.markdown(
706
+ f'<a href="data:application/octet-stream;base64,{value}" download="{filename}"><input type="button" value="Download File"></a>',
707
+ unsafe_allow_html=True,
708
+ )
709
+
710
+ def _render_single_complex_property(
711
+ self, streamlit: st, property_schema: Dict, value: Any
712
+ ) -> None:
713
+ # Add title and subheader
714
+ streamlit.subheader(property_schema.get("title"))
715
+ if property_schema.get("description"):
716
+ streamlit.markdown(property_schema.get("description"))
717
+
718
+ streamlit.json(jsonable_encoder(value))
719
+
720
+ def _render_single_output(self, streamlit: st, output_data: BaseModel) -> None:
721
+ try:
722
+ if has_output_ui_renderer(output_data):
723
+ if function_has_named_arg(output_data.render_output_ui, "input"): # type: ignore
724
+ # render method also requests the input data
725
+ output_data.render_output_ui(streamlit, input=self._input_data) # type: ignore
726
+ else:
727
+ output_data.render_output_ui(streamlit) # type: ignore
728
+ return
729
+ except Exception:
730
+ # Use default auto-generation methods if the custom rendering throws an exception
731
+ logger.exception(
732
+ "Failed to execute custom render_output_ui function. Using auto-generation instead"
733
+ )
734
+
735
+ model_schema = output_data.schema(by_alias=False)
736
+ model_properties = model_schema.get("properties")
737
+ definitions = model_schema.get("definitions")
738
+
739
+ if model_properties:
740
+ for property_key in output_data.__dict__:
741
+ property_schema = model_properties.get(property_key)
742
+ if not property_schema.get("title"):
743
+ # Set property key as fallback title
744
+ property_schema["title"] = property_key
745
+
746
+ output_property_value = output_data.__dict__[property_key]
747
+
748
+ if has_output_ui_renderer(output_property_value):
749
+ output_property_value.render_output_ui(streamlit) # type: ignore
750
+ continue
751
+
752
+ if isinstance(output_property_value, BaseModel):
753
+ # Render output recursively
754
+ streamlit.subheader(property_schema.get("title"))
755
+ if property_schema.get("description"):
756
+ streamlit.markdown(property_schema.get("description"))
757
+ self._render_single_output(streamlit, output_property_value)
758
+ continue
759
+
760
+ if property_schema:
761
+ if schema_utils.is_single_file_property(property_schema):
762
+ self._render_single_file_property(
763
+ streamlit, property_schema, output_property_value
764
+ )
765
+ continue
766
+
767
+ if (
768
+ schema_utils.is_single_string_property(property_schema)
769
+ or schema_utils.is_single_number_property(property_schema)
770
+ or schema_utils.is_single_datetime_property(property_schema)
771
+ or schema_utils.is_single_boolean_property(property_schema)
772
+ ):
773
+ self._render_single_text_property(
774
+ streamlit, property_schema, output_property_value
775
+ )
776
+ continue
777
+ if definitions and schema_utils.is_single_enum_property(
778
+ property_schema, definitions
779
+ ):
780
+ self._render_single_text_property(
781
+ streamlit, property_schema, output_property_value.value
782
+ )
783
+ continue
784
+
785
+ # TODO: render dict as table
786
+
787
+ self._render_single_complex_property(
788
+ streamlit, property_schema, output_property_value
789
+ )
790
+ return
791
+
792
+ def _render_list_output(self, streamlit: st, output_data: List) -> None:
793
+ try:
794
+ data_items: List = []
795
+ for data_item in output_data:
796
+ if has_output_ui_renderer(data_item):
797
+ # Render using the render function
798
+ data_item.render_output_ui(streamlit) # type: ignore
799
+ continue
800
+ data_items.append(data_item.dict())
801
+ # Try to show as dataframe
802
+ streamlit.table(pd.DataFrame(data_items))
803
+ except Exception:
804
+ # Fallback to JSON output
805
+ streamlit.json(jsonable_encoder(output_data))
806
+
807
+
808
+ def getOpyrator(mode: str) -> Opyrator:
809
+ if mode is None or mode.startswith('VC'):
810
+ from mkgui.app_vc import convert
811
+ return Opyrator(convert)
812
+ if mode is None or mode.startswith('预处理'):
813
+ from mkgui.preprocess import preprocess
814
+ return Opyrator(preprocess)
815
+ if mode is None or mode.startswith('模型训练(VC)'):
816
+ from mkgui.train_vc import train_vc
817
+ return Opyrator(train_vc)
818
+ if mode is None or mode.startswith('模型训练'):
819
+ from mkgui.train import train
820
+ return Opyrator(train)
821
+ from mkgui.app import synthesize
822
+ return Opyrator(synthesize)
823
+
824
+
825
+ def render_streamlit_ui() -> None:
826
+ # init
827
+ session_state = st.session_state
828
+ session_state.input_data = {}
829
+ # Add custom css settings
830
+ st.markdown(f"<style>{CUSTOM_STREAMLIT_CSS}</style>", unsafe_allow_html=True)
831
+
832
+ with st.spinner("Loading MockingBird GUI. Please wait..."):
833
+ session_state.mode = st.sidebar.selectbox(
834
+ '模式选择',
835
+ ( "AI拟音", "VC拟音", "预处理", "模型训练", "模型训练(VC)")
836
+ )
837
+ if "mode" in session_state:
838
+ mode = session_state.mode
839
+ else:
840
+ mode = ""
841
+ opyrator = getOpyrator(mode)
842
+ title = opyrator.name + mode
843
+
844
+ col1, col2, _ = st.columns(3)
845
+ col2.title(title)
846
+ col2.markdown("欢迎使用MockingBird Web 2")
847
+
848
+ image = Image.open("mkgui/static/mb.png")
849
+ col1.image(image)
850
+
851
+ st.markdown("---")
852
+ left, right = st.columns([0.4, 0.6])
853
+
854
+ with left:
855
+ st.header("Control 控制")
856
+ InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
857
+ execute_selected = st.button(opyrator.action)
858
+ if execute_selected:
859
+ with st.spinner("Executing operation. Please wait..."):
860
+ try:
861
+ input_data_obj = parse_obj_as(
862
+ opyrator.input_type, session_state.input_data
863
+ )
864
+ session_state.output_data = opyrator(input=input_data_obj)
865
+ session_state.latest_operation_input = input_data_obj # should this really be saved as additional session object?
866
+ except ValidationError as ex:
867
+ st.error(ex)
868
+ else:
869
+ # st.success("Operation executed successfully.")
870
+ pass
871
+
872
+ with right:
873
+ st.header("Result 结果")
874
+ if 'output_data' in session_state:
875
+ OutputUI(
876
+ session_state.output_data, session_state.latest_operation_input
877
+ ).render_ui(st)
878
+ if st.button("Clear"):
879
+ # Clear all state
880
+ for key in st.session_state.keys():
881
+ del st.session_state[key]
882
+ session_state.input_data = {}
883
+ st.experimental_rerun()
884
+ else:
885
+ # placeholder
886
+ st.caption("请使用左侧控制板进行输入并运行获得结果")
887
+
888
+
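The nested-input handling above relies on dotted keys ("a.b.c") that _store_value and _get_value expand into nested dicts; a minimal standalone sketch of that convention (independent of Streamlit):

# Standalone illustration of the dotted-key convention used by InputUI above.
input_data = {}

def store_value(data, key, value):
    parts = key.split(".")
    node = data
    for part in parts[:-1]:
        node = node.setdefault(part, {})
    node[parts[-1]] = value

store_value(input_data, "speaker.name", "demo")
store_value(input_data, "speaker.embedding.dim", 256)
print(input_data)  # {'speaker': {'name': 'demo', 'embedding': {'dim': 256}}}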
mkgui/base/ui/streamlit_utils.py ADDED
@@ -0,0 +1,13 @@
1
+ CUSTOM_STREAMLIT_CSS = """
2
+ div[data-testid="stBlock"] button {
3
+ width: 100% !important;
4
+ margin-bottom: 20px !important;
5
+ border-color: #bfbfbf !important;
6
+ }
7
+ section[data-testid="stSidebar"] div {
8
+ max-width: 10rem;
9
+ }
10
+ pre code {
11
+ white-space: pre-wrap;
12
+ }
13
+ """
mkgui/preprocess.py ADDED
@@ -0,0 +1,96 @@
1
+ from pydantic import BaseModel, Field
2
+ import os
3
+ from pathlib import Path
4
+ from enum import Enum
5
+ from typing import Any, Tuple
6
+
7
+
8
+ # Constants
9
+ EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
10
+ ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
11
+
12
+
13
+ if os.path.isdir(EXT_MODELS_DIRT):
14
+ extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
15
+ print("Loaded extractor models: " + str(len(extractors)))
16
+ else:
17
+ raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")
18
+
19
+ if os.path.isdir(ENC_MODELS_DIRT):
20
+ encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
21
+ print("Loaded encoders models: " + str(len(encoders)))
22
+ else:
23
+ raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
24
+
25
+ class Model(str, Enum):
26
+ VC_PPG2MEL = "ppg2mel"
27
+
28
+ class Dataset(str, Enum):
29
+ AIDATATANG_200ZH = "aidatatang_200zh"
30
+ AIDATATANG_200ZH_S = "aidatatang_200zh_s"
31
+
32
+ class Input(BaseModel):
33
+ # def render_input_ui(st, input) -> Dict:
34
+ # input["selected_dataset"] = st.selectbox(
35
+ # '选择数据集',
36
+ # ("aidatatang_200zh", "aidatatang_200zh_s")
37
+ # )
38
+ # return input
39
+ model: Model = Field(
40
+ Model.VC_PPG2MEL, title="目标模型",
41
+ )
42
+ dataset: Dataset = Field(
43
+ Dataset.AIDATATANG_200ZH, title="数据集选择",
44
+ )
45
+ datasets_root: str = Field(
46
+ ..., alias="数据集根目录", description="输入数据集根目录(相对/绝对)",
47
+ format=True,
48
+ example="..\\trainning_data\\"
49
+ )
50
+ output_root: str = Field(
51
+ ..., alias="输出根目录", description="输出结果根目录(相对/绝对)",
52
+ format=True,
53
+ example="..\\trainning_data\\"
54
+ )
55
+ n_processes: int = Field(
56
+ 2, alias="处理线程数", description="根据CPU线程数来设置",
57
+ le=32, ge=1
58
+ )
59
+ extractor: extractors = Field(
60
+ ..., alias="特征提取模型",
61
+ description="选择PPG特征提取模型文件."
62
+ )
63
+ encoder: encoders = Field(
64
+ ..., alias="语音编码模型",
65
+ description="选择语音编码模型文件."
66
+ )
67
+
68
+ class AudioEntity(BaseModel):
69
+ content: bytes
70
+ mel: Any
71
+
72
+ class Output(BaseModel):
73
+ __root__: Tuple[str, int]
74
+
75
+ def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
76
+ """Custom output UI.
77
+ If this method is implmeneted, it will be used instead of the default Output UI renderer.
78
+ """
79
+ sr, count = self.__root__
80
+ streamlit_app.subheader(f"Dataset {sr} done processed total of {count}")
81
+
82
+ def preprocess(input: Input) -> Output:
83
+ """Preprocess(预处理)"""
84
+ finished = 0
85
+ if input.model == Model.VC_PPG2MEL:
86
+ from ppg2mel.preprocess import preprocess_dataset
87
+ finished = preprocess_dataset(
88
+ datasets_root=Path(input.datasets_root),
89
+ dataset=input.dataset,
90
+ out_dir=Path(input.output_root),
91
+ n_processes=input.n_processes,
92
+ ppg_encoder_model_fpath=Path(input.extractor.value),
93
+ speaker_encoder_model=Path(input.encoder.value)
94
+ )
95
+ # TODO: pass useful return code
96
+ return Output(__root__=(input.dataset, finished))
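The Enum('extractors', ...) construction near the top of this file turns every checkpoint found on disk into a selectable enum member; a standalone sketch of the same pattern with hypothetical file names:

# Standalone sketch of the dynamic-Enum pattern used above (file names are hypothetical).
from enum import Enum
from pathlib import Path

files = [Path("encoder/saved_models/pretrained.pt"), Path("encoder/saved_models/finetuned.pt")]
encoders = Enum("encoders", [(f.name, f) for f in files])

print(len(encoders))                    # 2
print(encoders["pretrained.pt"].value)  # encoder/saved_models/pretrained.pt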
mkgui/static/mb.png ADDED
mkgui/train.py ADDED
@@ -0,0 +1,106 @@
1
+ from pydantic import BaseModel, Field
2
+ import os
3
+ from pathlib import Path
4
+ from enum import Enum
5
+ from typing import Any
6
+ from synthesizer.hparams import hparams
7
+ from synthesizer.train import train as synt_train
8
+
9
+ # Constants
10
+ SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
11
+ ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
12
+
13
+
14
+ # EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
15
+ # CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
16
+ # ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
17
+
18
+ # Pre-Load models
19
+ if os.path.isdir(SYN_MODELS_DIRT):
20
+ synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
21
+ print("Loaded synthesizer models: " + str(len(synthesizers)))
22
+ else:
23
+ raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
24
+
25
+ if os.path.isdir(ENC_MODELS_DIRT):
26
+ encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
27
+ print("Loaded encoders models: " + str(len(encoders)))
28
+ else:
29
+ raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
30
+
31
+ class Model(str, Enum):
32
+ DEFAULT = "default"
33
+
34
+ class Input(BaseModel):
35
+ model: Model = Field(
36
+ Model.DEFAULT, title="模型类型",
37
+ )
38
+ # datasets_root: str = Field(
39
+ # ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
40
+ # format=True,
41
+ # example="..\\trainning_data\\"
42
+ # )
43
+ input_root: str = Field(
44
+ ..., alias="输入目录", description="预处理数据根目录",
45
+ format=True,
46
+ example=f"..{os.sep}audiodata{os.sep}SV2TTS{os.sep}synthesizer"
47
+ )
48
+ run_id: str = Field(
49
+ "", alias="新模型名/运行ID", description="使用新ID进行重新训练,否则选择下面的模型进行继续训练",
50
+ )
51
+ synthesizer: synthesizers = Field(
52
+ ..., alias="已有合成模型",
53
+ description="选择语音合成模型文件."
54
+ )
55
+ gpu: bool = Field(
56
+ True, alias="GPU训练", description="选择“是”,则使用GPU训练",
57
+ )
58
+ verbose: bool = Field(
59
+ True, alias="打印详情", description="选择“是”,输出更多详情",
60
+ )
61
+ encoder: encoders = Field(
62
+ ..., alias="语音编码模型",
63
+ description="选择语音编码模型文件."
64
+ )
65
+ save_every: int = Field(
66
+ 1000, alias="更新间隔", description="每隔n步则更新一次模型",
67
+ )
68
+ backup_every: int = Field(
69
+ 10000, alias="保存间隔", description="每隔n步则保存一次模型",
70
+ )
71
+ log_every: int = Field(
72
+ 500, alias="打印间隔", description="每隔n步则打印一次训练统计",
73
+ )
74
+
75
+ class AudioEntity(BaseModel):
76
+ content: bytes
77
+ mel: Any
78
+
79
+ class Output(BaseModel):
80
+ __root__: int
81
+
82
+ def render_output_ui(self, streamlit_app) -> None: # type: ignore
83
+ """Custom output UI.
84
+ If this method is implmeneted, it will be used instead of the default Output UI renderer.
85
+ """
86
+ streamlit_app.subheader(f"Training started with code: {self.__root__}")
87
+
88
+ def train(input: Input) -> Output:
89
+ """Train(训练)"""
90
+
91
+ print(">>> Start training ...")
92
+ force_restart = len(input.run_id) > 0
93
+ if not force_restart:
94
+ input.run_id = Path(input.synthesizer.value).name.split('.')[0]
95
+
96
+ synt_train(
97
+ input.run_id,
98
+ input.input_root,
99
+ f"synthesizer{os.sep}saved_models",
100
+ input.save_every,
101
+ input.backup_every,
102
+ input.log_every,
103
+ force_restart,
104
+ hparams
105
+ )
106
+ return Output(__root__=0)
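The run-ID logic in train() above restarts from scratch when a new ID is given and otherwise continues from the selected synthesizer checkpoint; a minimal standalone sketch of that decision (paths are hypothetical):

# Standalone sketch of the run-ID decision used by train() above.
from pathlib import Path

def resolve_run_id(run_id: str, synthesizer_path: str):
    force_restart = len(run_id) > 0
    if not force_restart:
        # Continue training: reuse the checkpoint's base name as the run ID.
        run_id = Path(synthesizer_path).name.split(".")[0]
    return run_id, force_restart

print(resolve_run_id("", "synthesizer/saved_models/pretrained.pt"))            # ('pretrained', False)
print(resolve_run_id("my_new_run", "synthesizer/saved_models/pretrained.pt"))  # ('my_new_run', True)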
mkgui/train_vc.py ADDED
@@ -0,0 +1,155 @@
1
+ from pydantic import BaseModel, Field
2
+ import os
3
+ from pathlib import Path
4
+ from enum import Enum
5
+ from typing import Any, Tuple
6
+ import numpy as np
7
+ from utils.load_yaml import HpsYaml
8
+ from utils.util import AttrDict
9
+ import torch
10
+
11
+ # Constants
12
+ EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
13
+ CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
14
+ ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
15
+
16
+
17
+ if os.path.isdir(EXT_MODELS_DIRT):
18
+ extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
19
+ print("Loaded extractor models: " + str(len(extractors)))
20
+ else:
21
+ raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")
22
+
23
+ if os.path.isdir(CONV_MODELS_DIRT):
24
+ convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
25
+ print("Loaded convertor models: " + str(len(convertors)))
26
+ else:
27
+ raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")
28
+
29
+ if os.path.isdir(ENC_MODELS_DIRT):
30
+ encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
31
+ print("Loaded encoders models: " + str(len(encoders)))
32
+ else:
33
+ raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
34
+
35
+ class Model(str, Enum):
36
+ VC_PPG2MEL = "ppg2mel"
37
+
38
+ class Dataset(str, Enum):
39
+ AIDATATANG_200ZH = "aidatatang_200zh"
40
+ AIDATATANG_200ZH_S = "aidatatang_200zh_s"
41
+
42
+ class Input(BaseModel):
43
+ # def render_input_ui(st, input) -> Dict:
44
+ # input["selected_dataset"] = st.selectbox(
45
+ # '选择数据集',
46
+ # ("aidatatang_200zh", "aidatatang_200zh_s")
47
+ # )
48
+ # return input
49
+ model: Model = Field(
50
+ Model.VC_PPG2MEL, title="模型类型",
51
+ )
52
+ # datasets_root: str = Field(
53
+ # ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
54
+ # format=True,
55
+ # example="..\\trainning_data\\"
56
+ # )
57
+ output_root: str = Field(
58
+ ..., alias="输出目录(可选)", description="建议不填,保持默认",
59
+ format=True,
60
+ example=""
61
+ )
62
+ continue_mode: bool = Field(
63
+ True, alias="继续训练模式", description="选择“是”,则从下面选择的模型中继续训练",
64
+ )
65
+ gpu: bool = Field(
66
+ True, alias="GPU训练", description="选择“是”,则使用GPU训练",
67
+ )
68
+ verbose: bool = Field(
69
+ True, alias="打印详情", description="选择“是”,输出更多详情",
70
+ )
71
+ # TODO: Move to hiden fields by default
72
+ convertor: convertors = Field(
73
+ ..., alias="转换模型",
74
+ description="选择语音转换模型文件."
75
+ )
76
+ extractor: extractors = Field(
77
+ ..., alias="特征提取模型",
78
+ description="选择PPG特征提取模型文件."
79
+ )
80
+ encoder: encoders = Field(
81
+ ..., alias="语音编码模型",
82
+ description="选择语音编码模型文件."
83
+ )
84
+ njobs: int = Field(
85
+ 8, alias="进程数", description="适用于ppg2mel",
86
+ )
87
+ seed: int = Field(
88
+ default=0, alias="初始随机数", description="适用于ppg2mel",
89
+ )
90
+ model_name: str = Field(
91
+ ..., alias="新模型名", description="仅在重新训练时生效,选中继续训练时无效",
92
+ example="test"
93
+ )
94
+ model_config: str = Field(
95
+ ..., alias="新模型配置", description="仅在重新训练时生效,选中继续训练时无效",
96
+ example=".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2"
97
+ )
98
+
99
+ class AudioEntity(BaseModel):
100
+ content: bytes
101
+ mel: Any
102
+
103
+ class Output(BaseModel):
104
+ __root__: Tuple[str, int]
105
+
106
+ def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
107
+ """Custom output UI.
108
+ If this method is implmeneted, it will be used instead of the default Output UI renderer.
109
+ """
110
+ sr, count = self.__root__
111
+ streamlit_app.subheader(f"Dataset {sr} done processed total of {count}")
112
+
113
+ def train_vc(input: Input) -> Output:
114
+ """Train VC(训练 VC)"""
115
+
116
+ print(">>> OneShot VC training ...")
117
+ params = AttrDict()
118
+ params.update({
119
+ "gpu": input.gpu,
120
+ "cpu": not input.gpu,
121
+ "njobs": input.njobs,
122
+ "seed": input.seed,
123
+ "verbose": input.verbose,
124
+ "load": input.convertor.value,
125
+ "warm_start": False,
126
+ })
127
+ if input.continue_mode:
128
+ # trace old model and config
129
+ p = Path(input.convertor.value)
130
+ params.name = p.parent.name
131
+ # search a config file
132
+ model_config_fpaths = list(p.parent.rglob("*.yaml"))
133
+ if len(model_config_fpaths) == 0:
134
+ raise Exception("No model yaml config found for convertor")
135
+ config = HpsYaml(model_config_fpaths[0])
136
+ params.ckpdir = p.parent.parent
137
+ params.config = model_config_fpaths[0]
138
+ params.logdir = os.path.join(p.parent, "log")
139
+ else:
140
+ # Make the config dict dot visitable
141
+ config = HpsYaml(input.model_config)
142
+ np.random.seed(input.seed)
143
+ torch.manual_seed(input.seed)
144
+ if torch.cuda.is_available():
145
+ torch.cuda.manual_seed_all(input.seed)
146
+ mode = "train"
147
+ from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
148
+ solver = Solver(config, params, mode)
149
+ solver.load_data()
150
+ solver.set_model()
151
+ solver.exec()
152
+ print(">>> Oneshot VC train finished!")
153
+
154
+ # TODO: pass useful return code
155
+ return Output(__root__=(input.model_name, 0))
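In continue mode, train_vc derives the run name, checkpoint directory and config from the location of the selected convertor checkpoint; a standalone sketch of that path arithmetic (the checkpoint path is hypothetical):

# Standalone sketch of the continue-mode path derivation above (hypothetical checkpoint path).
from pathlib import Path

p = Path("ppg2mel/saved_models/my_run/best_loss_step_50000.pth")
print(p.parent.name)    # my_run -> used as params.name
print(p.parent.parent)  # ppg2mel/saved_models -> used as params.ckpdir
# The training config is then searched next to the checkpoint:
# model_config_fpaths = list(p.parent.rglob("*.yaml"))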
packages.txt ADDED
@@ -0,0 +1,5 @@
1
+ libasound2-dev
2
+ portaudio19-dev
3
+ libportaudio2
4
+ libportaudiocpp0
5
+ ffmpeg
ppg2mel/__init__.py ADDED
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2020 Songxiang Liu
4
+ # Apache 2.0
5
+
6
+ from typing import List
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ import numpy as np
12
+
13
+ from .utils.abs_model import AbsMelDecoder
14
+ from .rnn_decoder_mol import Decoder
15
+ from .utils.cnn_postnet import Postnet
16
+ from .utils.vc_utils import get_mask_from_lengths
17
+
18
+ from utils.load_yaml import HpsYaml
19
+
20
+ class MelDecoderMOLv2(AbsMelDecoder):
21
+ """Use an encoder to preprocess ppg."""
22
+ def __init__(
23
+ self,
24
+ num_speakers: int,
25
+ spk_embed_dim: int,
26
+ bottle_neck_feature_dim: int,
27
+ encoder_dim: int = 256,
28
+ encoder_downsample_rates: List = [2, 2],
29
+ attention_rnn_dim: int = 512,
30
+ decoder_rnn_dim: int = 512,
31
+ num_decoder_rnn_layer: int = 1,
32
+ concat_context_to_last: bool = True,
33
+ prenet_dims: List = [256, 128],
34
+ num_mixtures: int = 5,
35
+ frames_per_step: int = 2,
36
+ mask_padding: bool = True,
37
+ ):
38
+ super().__init__()
39
+
40
+ self.mask_padding = mask_padding
41
+ self.bottle_neck_feature_dim = bottle_neck_feature_dim
42
+ self.num_mels = 80
43
+ self.encoder_down_factor=np.cumprod(encoder_downsample_rates)[-1]
44
+ self.frames_per_step = frames_per_step
45
+ self.use_spk_dvec = True
46
+
47
+ input_dim = bottle_neck_feature_dim
48
+
49
+ # Downsampling convolution
50
+ self.bnf_prenet = torch.nn.Sequential(
51
+ torch.nn.Conv1d(input_dim, encoder_dim, kernel_size=1, bias=False),
52
+ torch.nn.LeakyReLU(0.1),
53
+
54
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
55
+ torch.nn.Conv1d(
56
+ encoder_dim, encoder_dim,
57
+ kernel_size=2*encoder_downsample_rates[0],
58
+ stride=encoder_downsample_rates[0],
59
+ padding=encoder_downsample_rates[0]//2,
60
+ ),
61
+ torch.nn.LeakyReLU(0.1),
62
+
63
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
64
+ torch.nn.Conv1d(
65
+ encoder_dim, encoder_dim,
66
+ kernel_size=2*encoder_downsample_rates[1],
67
+ stride=encoder_downsample_rates[1],
68
+ padding=encoder_downsample_rates[1]//2,
69
+ ),
70
+ torch.nn.LeakyReLU(0.1),
71
+
72
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
73
+ )
74
+ decoder_enc_dim = encoder_dim
75
+ self.pitch_convs = torch.nn.Sequential(
76
+ torch.nn.Conv1d(2, encoder_dim, kernel_size=1, bias=False),
77
+ torch.nn.LeakyReLU(0.1),
78
+
79
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
80
+ torch.nn.Conv1d(
81
+ encoder_dim, encoder_dim,
82
+ kernel_size=2*encoder_downsample_rates[0],
83
+ stride=encoder_downsample_rates[0],
84
+ padding=encoder_downsample_rates[0]//2,
85
+ ),
86
+ torch.nn.LeakyReLU(0.1),
87
+
88
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
89
+ torch.nn.Conv1d(
90
+ encoder_dim, encoder_dim,
91
+ kernel_size=2*encoder_downsample_rates[1],
92
+ stride=encoder_downsample_rates[1],
93
+ padding=encoder_downsample_rates[1]//2,
94
+ ),
95
+ torch.nn.LeakyReLU(0.1),
96
+
97
+ torch.nn.InstanceNorm1d(encoder_dim, affine=False),
98
+ )
99
+
100
+ self.reduce_proj = torch.nn.Linear(encoder_dim + spk_embed_dim, encoder_dim)
101
+
102
+ # Decoder
103
+ self.decoder = Decoder(
104
+ enc_dim=decoder_enc_dim,
105
+ num_mels=self.num_mels,
106
+ frames_per_step=frames_per_step,
107
+ attention_rnn_dim=attention_rnn_dim,
108
+ decoder_rnn_dim=decoder_rnn_dim,
109
+ num_decoder_rnn_layer=num_decoder_rnn_layer,
110
+ prenet_dims=prenet_dims,
111
+ num_mixtures=num_mixtures,
112
+ use_stop_tokens=True,
113
+ concat_context_to_last=concat_context_to_last,
114
+ encoder_down_factor=self.encoder_down_factor,
115
+ )
116
+
117
+ # Mel-Spec Postnet: some residual CNN layers
118
+ self.postnet = Postnet()
119
+
120
+ def parse_output(self, outputs, output_lengths=None):
121
+ if self.mask_padding and output_lengths is not None:
122
+ mask = ~get_mask_from_lengths(output_lengths, outputs[0].size(1))
123
+ mask = mask.unsqueeze(2).expand(mask.size(0), mask.size(1), self.num_mels)
124
+ outputs[0].data.masked_fill_(mask, 0.0)
125
+ outputs[1].data.masked_fill_(mask, 0.0)
126
+ return outputs
127
+
128
+ def forward(
129
+ self,
130
+ bottle_neck_features: torch.Tensor,
131
+ feature_lengths: torch.Tensor,
132
+ speech: torch.Tensor,
133
+ speech_lengths: torch.Tensor,
134
+ logf0_uv: torch.Tensor = None,
135
+ spembs: torch.Tensor = None,
136
+ output_att_ws: bool = False,
137
+ ):
138
+ decoder_inputs = self.bnf_prenet(
139
+ bottle_neck_features.transpose(1, 2)
140
+ ).transpose(1, 2)
141
+ logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
142
+ decoder_inputs = decoder_inputs + logf0_uv
143
+
144
+ assert spembs is not None
145
+ spk_embeds = F.normalize(
146
+ spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
147
+ decoder_inputs = torch.cat([decoder_inputs, spk_embeds], dim=-1)
148
+ decoder_inputs = self.reduce_proj(decoder_inputs)
149
+
150
+ # (B, num_mels, T_dec)
151
+ T_dec = torch.div(feature_lengths, int(self.encoder_down_factor), rounding_mode='floor')
152
+ mel_outputs, predicted_stop, alignments = self.decoder(
153
+ decoder_inputs, speech, T_dec)
154
+ ## Post-processing
155
+ mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
156
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
157
+ if output_att_ws:
158
+ return self.parse_output(
159
+ [mel_outputs, mel_outputs_postnet, predicted_stop, alignments], speech_lengths)
160
+ else:
161
+ return self.parse_output(
162
+ [mel_outputs, mel_outputs_postnet, predicted_stop], speech_lengths)
163
+
164
+ # return mel_outputs, mel_outputs_postnet
165
+
166
+ def inference(
167
+ self,
168
+ bottle_neck_features: torch.Tensor,
169
+ logf0_uv: torch.Tensor = None,
170
+ spembs: torch.Tensor = None,
171
+ ):
172
+ decoder_inputs = self.bnf_prenet(bottle_neck_features.transpose(1, 2)).transpose(1, 2)
173
+ logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
174
+ decoder_inputs = decoder_inputs + logf0_uv
175
+
176
+ assert spembs is not None
177
+ spk_embeds = F.normalize(
178
+ spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
179
+ bottle_neck_features = torch.cat([decoder_inputs, spk_embeds], dim=-1)
180
+ bottle_neck_features = self.reduce_proj(bottle_neck_features)
181
+
182
+ ## Decoder
183
+ if bottle_neck_features.size(0) > 1:
184
+ mel_outputs, alignments = self.decoder.inference_batched(bottle_neck_features)
185
+ else:
186
+ mel_outputs, alignments = self.decoder.inference(bottle_neck_features,)
187
+ ## Post-processing
188
+ mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
189
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
190
+ # outputs = mel_outputs_postnet[0]
191
+
192
+ return mel_outputs[0], mel_outputs_postnet[0], alignments[0]
193
+
194
+ def load_model(model_file, device=None):
195
+ # search a config file
196
+ model_config_fpaths = list(model_file.parent.rglob("*.yaml"))
197
+ if len(model_config_fpaths) == 0:
198
+ raise "No model yaml config found for convertor"
199
+ if device is None:
200
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
201
+
202
+ model_config = HpsYaml(model_config_fpaths[0])
203
+ ppg2mel_model = MelDecoderMOLv2(
204
+ **model_config["model"]
205
+ ).to(device)
206
+ ckpt = torch.load(model_file, map_location=device)
207
+ ppg2mel_model.load_state_dict(ckpt["model"])
208
+ ppg2mel_model.eval()
209
+ return ppg2mel_model
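Editor's note: the sketch below is not part of the commit; it shows one way the converter defined above might be loaded and run. The checkpoint path and the feature sizes (144-dim bottleneck features, 256-dim speaker embedding) are illustrative assumptions; the real dimensions come from the yaml config stored next to the checkpoint.
# Hypothetical usage sketch for load_model / MelDecoderMOLv2.inference (paths and sizes are assumptions)
from pathlib import Path
import torch
from ppg2mel import load_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ppg2mel_model = load_model(Path("ppg2mel/saved_models/example.pth"), device)  # placeholder path

ppg = torch.randn(1, 200, 144, device=device)      # (B, T, bottle_neck_feature_dim) - assumed 144
logf0_uv = torch.randn(1, 200, 2, device=device)   # (B, T, 2): log-F0 and voiced/unvoiced flag
spk_embed = torch.randn(1, 256, device=device)     # (B, spk_embed_dim) speaker d-vector - assumed 256

with torch.no_grad():
    mel, mel_postnet, alignment = ppg2mel_model.inference(ppg, logf0_uv=logf0_uv, spembs=spk_embed)
print(mel_postnet.shape)                           # (T_mel, 80) mel-spectrogram frames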
ppg2mel/preprocess.py ADDED
@@ -0,0 +1,113 @@
1
+
2
+ import os
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from pathlib import Path
7
+ import soundfile
8
+ import resampy
9
+
10
+ from ppg_extractor import load_model
11
+ import encoder.inference as Encoder
12
+ from encoder.audio import preprocess_wav
13
+ from encoder import audio
14
+ from utils.f0_utils import compute_f0
15
+
16
+ from torch.multiprocessing import Pool, cpu_count
17
+ from functools import partial
18
+
19
+ SAMPLE_RATE=16000
20
+
21
+ def _compute_bnf(
22
+ wav: np.ndarray,
23
+ output_fpath: str,
24
+ device: torch.device,
25
+ ppg_model_local: torch.nn.Module,
26
+ ):
27
+ """
28
+ Compute CTC-Attention Seq2seq ASR encoder bottle-neck features (BNF).
29
+ """
30
+ ppg_model_local.to(device)
31
+ wav_tensor = torch.from_numpy(wav).float().to(device).unsqueeze(0)
32
+ wav_length = torch.LongTensor([wav.shape[0]]).to(device)
33
+ with torch.no_grad():
34
+ bnf = ppg_model_local(wav_tensor, wav_length)
35
+ bnf_npy = bnf.squeeze(0).cpu().numpy()
36
+ np.save(output_fpath, bnf_npy, allow_pickle=False)
37
+ return bnf_npy, len(bnf_npy)
38
+
39
+ def _compute_f0_from_wav(wav, output_fpath):
40
+ """Compute merged f0 values."""
41
+ f0 = compute_f0(wav, SAMPLE_RATE)
42
+ np.save(output_fpath, f0, allow_pickle=False)
43
+ return f0, len(f0)
44
+
45
+ def _compute_spkEmbed(wav, output_fpath, encoder_model_local, device):
46
+ Encoder.set_model(encoder_model_local)
47
+ # Compute where to split the utterance into partials and pad if necessary
48
+ wave_slices, mel_slices = Encoder.compute_partial_slices(len(wav), rate=1.3, min_pad_coverage=0.75)
49
+ max_wave_length = wave_slices[-1].stop
50
+ if max_wave_length >= len(wav):
51
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
52
+
53
+ # Split the utterance into partials
54
+ frames = audio.wav_to_mel_spectrogram(wav)
55
+ frames_batch = np.array([frames[s] for s in mel_slices])
56
+ partial_embeds = Encoder.embed_frames_batch(frames_batch)
57
+
58
+ # Compute the utterance embedding from the partial embeddings
59
+ raw_embed = np.mean(partial_embeds, axis=0)
60
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
61
+
62
+ np.save(output_fpath, embed, allow_pickle=False)
63
+ return embed, len(embed)
64
+
65
+ def preprocess_one(wav_path, out_dir, device, ppg_model_local, encoder_model_local):
66
+ # wav = preprocess_wav(wav_path)
67
+ # try:
68
+ wav, sr = soundfile.read(wav_path)
69
+ if len(wav) < sr:
70
+ return None, sr, len(wav)
71
+ if sr != SAMPLE_RATE:
72
+ wav = resampy.resample(wav, sr, SAMPLE_RATE)
73
+ sr = SAMPLE_RATE
74
+ utt_id = os.path.splitext(os.path.basename(wav_path))[0]  # filename without the ".wav" extension
75
+
76
+ _, length_bnf = _compute_bnf(output_fpath=f"{out_dir}/bnf/{utt_id}.ling_feat.npy", wav=wav, device=device, ppg_model_local=ppg_model_local)
77
+ _, length_f0 = _compute_f0_from_wav(output_fpath=f"{out_dir}/f0/{utt_id}.f0.npy", wav=wav)
78
+ _, length_embed = _compute_spkEmbed(output_fpath=f"{out_dir}/embed/{utt_id}.npy", device=device, encoder_model_local=encoder_model_local, wav=wav)
79
+
80
+ def preprocess_dataset(datasets_root, dataset, out_dir, n_processes, ppg_encoder_model_fpath, speaker_encoder_model):
81
+ # Glob wav files
82
+ wav_file_list = sorted(Path(f"{datasets_root}/{dataset}").glob("**/*.wav"))
83
+ print(f"Globbed {len(wav_file_list)} wav files.")
84
+
85
+ out_dir.joinpath("bnf").mkdir(exist_ok=True, parents=True)
86
+ out_dir.joinpath("f0").mkdir(exist_ok=True, parents=True)
87
+ out_dir.joinpath("embed").mkdir(exist_ok=True, parents=True)
88
+ ppg_model_local = load_model(ppg_encoder_model_fpath, "cpu")
89
+ encoder_model_local = Encoder.load_model(speaker_encoder_model, "cpu")
90
+ if n_processes is None:
91
+ n_processes = cpu_count()
92
+
93
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
94
+ func = partial(preprocess_one, out_dir=out_dir, ppg_model_local=ppg_model_local, encoder_model_local=encoder_model_local, device=device)
95
+ job = Pool(n_processes).imap(func, wav_file_list)
96
+ list(tqdm(job, "Preprocessing", len(wav_file_list), unit="wav"))
97
+
98
+ # finish processing and mark
99
+ t_fid_file = out_dir.joinpath("train_fidlist.txt").open("w", encoding="utf-8")
100
+ d_fid_file = out_dir.joinpath("dev_fidlist.txt").open("w", encoding="utf-8")
101
+ e_fid_file = out_dir.joinpath("eval_fidlist.txt").open("w", encoding="utf-8")
102
+ for file in sorted(out_dir.joinpath("f0").glob("*.npy")):
103
+ id = os.path.basename(file).split(".f0.npy")[0]
104
+ if id.endswith("01"):
105
+ d_fid_file.write(id + "\n")
106
+ elif id.endswith("09"):
107
+ e_fid_file.write(id + "\n")
108
+ else:
109
+ t_fid_file.write(id + "\n")
110
+ t_fid_file.close()
111
+ d_fid_file.close()
112
+ e_fid_file.close()
113
+ return len(wav_file_list)
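Editor's note: a minimal invocation sketch for preprocess_dataset above, not part of the commit. The dataset name, worker count, and model checkpoint locations are placeholders.
# Hypothetical driver for the preprocessing entry point (all paths/names are placeholders)
from pathlib import Path
from ppg2mel.preprocess import preprocess_dataset

n_files = preprocess_dataset(
    datasets_root=Path("datasets"),               # expects datasets/<dataset>/**/*.wav
    dataset="my_corpus",                          # placeholder dataset folder name
    out_dir=Path("datasets/ppg2mel_features"),    # bnf/, f0/, embed/ and the fid lists land here
    n_processes=4,
    ppg_encoder_model_fpath=Path("ppg_extractor/saved_models/example.pt"),  # placeholder
    speaker_encoder_model=Path("encoder/saved_models/pretrained.pt"),       # placeholder
)
print(f"Preprocessed {n_files} wav files")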
ppg2mel/rnn_decoder_mol.py ADDED
@@ -0,0 +1,374 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from .utils.mol_attention import MOLAttention
6
+ from .utils.basic_layers import Linear
7
+ from .utils.vc_utils import get_mask_from_lengths
8
+
9
+
10
+ class DecoderPrenet(nn.Module):
11
+ def __init__(self, in_dim, sizes):
12
+ super().__init__()
13
+ in_sizes = [in_dim] + sizes[:-1]
14
+ self.layers = nn.ModuleList(
15
+ [Linear(in_size, out_size, bias=False)
16
+ for (in_size, out_size) in zip(in_sizes, sizes)])
17
+
18
+ def forward(self, x):
19
+ for linear in self.layers:
20
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
21
+ return x
22
+
23
+
24
+ class Decoder(nn.Module):
25
+ """Mixture of Logistic (MoL) attention-based RNN Decoder."""
26
+ def __init__(
27
+ self,
28
+ enc_dim,
29
+ num_mels,
30
+ frames_per_step,
31
+ attention_rnn_dim,
32
+ decoder_rnn_dim,
33
+ prenet_dims,
34
+ num_mixtures,
35
+ encoder_down_factor=1,
36
+ num_decoder_rnn_layer=1,
37
+ use_stop_tokens=False,
38
+ concat_context_to_last=False,
39
+ ):
40
+ super().__init__()
41
+ self.enc_dim = enc_dim
42
+ self.encoder_down_factor = encoder_down_factor
43
+ self.num_mels = num_mels
44
+ self.frames_per_step = frames_per_step
45
+ self.attention_rnn_dim = attention_rnn_dim
46
+ self.decoder_rnn_dim = decoder_rnn_dim
47
+ self.prenet_dims = prenet_dims
48
+ self.use_stop_tokens = use_stop_tokens
49
+ self.num_decoder_rnn_layer = num_decoder_rnn_layer
50
+ self.concat_context_to_last = concat_context_to_last
51
+
52
+ # Mel prenet
53
+ self.prenet = DecoderPrenet(num_mels, prenet_dims)
54
+ self.prenet_pitch = DecoderPrenet(num_mels, prenet_dims)
55
+
56
+ # Attention RNN
57
+ self.attention_rnn = nn.LSTMCell(
58
+ prenet_dims[-1] + enc_dim,
59
+ attention_rnn_dim
60
+ )
61
+
62
+ # Attention
63
+ self.attention_layer = MOLAttention(
64
+ attention_rnn_dim,
65
+ r=frames_per_step/encoder_down_factor,
66
+ M=num_mixtures,
67
+ )
68
+
69
+ # Decoder RNN
70
+ self.decoder_rnn_layers = nn.ModuleList()
71
+ for i in range(num_decoder_rnn_layer):
72
+ if i == 0:
73
+ self.decoder_rnn_layers.append(
74
+ nn.LSTMCell(
75
+ enc_dim + attention_rnn_dim,
76
+ decoder_rnn_dim))
77
+ else:
78
+ self.decoder_rnn_layers.append(
79
+ nn.LSTMCell(
80
+ decoder_rnn_dim,
81
+ decoder_rnn_dim))
82
+ # self.decoder_rnn = nn.LSTMCell(
83
+ # 2 * enc_dim + attention_rnn_dim,
84
+ # decoder_rnn_dim
85
+ # )
86
+ if concat_context_to_last:
87
+ self.linear_projection = Linear(
88
+ enc_dim + decoder_rnn_dim,
89
+ num_mels * frames_per_step
90
+ )
91
+ else:
92
+ self.linear_projection = Linear(
93
+ decoder_rnn_dim,
94
+ num_mels * frames_per_step
95
+ )
96
+
97
+
98
+ # Stop-token layer
99
+ if self.use_stop_tokens:
100
+ if concat_context_to_last:
101
+ self.stop_layer = Linear(
102
+ enc_dim + decoder_rnn_dim, 1, bias=True, w_init_gain="sigmoid"
103
+ )
104
+ else:
105
+ self.stop_layer = Linear(
106
+ decoder_rnn_dim, 1, bias=True, w_init_gain="sigmoid"
107
+ )
108
+
109
+
110
+ def get_go_frame(self, memory):
111
+ B = memory.size(0)
112
+ go_frame = torch.zeros((B, self.num_mels), dtype=torch.float,
113
+ device=memory.device)
114
+ return go_frame
115
+
116
+ def initialize_decoder_states(self, memory, mask):
117
+ device = next(self.parameters()).device
118
+ B = memory.size(0)
119
+
120
+ # attention rnn states
121
+ self.attention_hidden = torch.zeros(
122
+ (B, self.attention_rnn_dim), device=device)
123
+ self.attention_cell = torch.zeros(
124
+ (B, self.attention_rnn_dim), device=device)
125
+
126
+ # decoder rnn states
127
+ self.decoder_hiddens = []
128
+ self.decoder_cells = []
129
+ for i in range(self.num_decoder_rnn_layer):
130
+ self.decoder_hiddens.append(
131
+ torch.zeros((B, self.decoder_rnn_dim),
132
+ device=device)
133
+ )
134
+ self.decoder_cells.append(
135
+ torch.zeros((B, self.decoder_rnn_dim),
136
+ device=device)
137
+ )
138
+ # self.decoder_hidden = torch.zeros(
139
+ # (B, self.decoder_rnn_dim), device=device)
140
+ # self.decoder_cell = torch.zeros(
141
+ # (B, self.decoder_rnn_dim), device=device)
142
+
143
+ self.attention_context = torch.zeros(
144
+ (B, self.enc_dim), device=device)
145
+
146
+ self.memory = memory
147
+ # self.processed_memory = self.attention_layer.memory_layer(memory)
148
+ self.mask = mask
149
+
150
+ def parse_decoder_inputs(self, decoder_inputs):
151
+ """Prepare decoder inputs, i.e. gt mel
152
+ Args:
153
+ decoder_inputs:(B, T_out, n_mel_channels) inputs used for teacher-forced training.
154
+ """
155
+ decoder_inputs = decoder_inputs.reshape(
156
+ decoder_inputs.size(0),
157
+ int(decoder_inputs.size(1)/self.frames_per_step), -1)
158
+ # (B, T_out//r, r*num_mels) -> (T_out//r, B, r*num_mels)
159
+ decoder_inputs = decoder_inputs.transpose(0, 1)
160
+ # (T_out//r, B, num_mels)
161
+ decoder_inputs = decoder_inputs[:,:,-self.num_mels:]
162
+ return decoder_inputs
163
+
164
+ def parse_decoder_outputs(self, mel_outputs, alignments, stop_outputs):
165
+ """ Prepares decoder outputs for output
166
+ Args:
167
+ mel_outputs:
168
+ alignments:
169
+ """
170
+ # (T_out//r, B, T_enc) -> (B, T_out//r, T_enc)
171
+ alignments = torch.stack(alignments).transpose(0, 1)
172
+ # (T_out//r, B) -> (B, T_out//r)
173
+ if stop_outputs is not None:
174
+ if alignments.size(0) == 1:
175
+ stop_outputs = torch.stack(stop_outputs).unsqueeze(0)
176
+ else:
177
+ stop_outputs = torch.stack(stop_outputs).transpose(0, 1)
178
+ stop_outputs = stop_outputs.contiguous()
179
+ # (T_out//r, B, num_mels*r) -> (B, T_out//r, num_mels*r)
180
+ mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
181
+ # decouple frames per step
182
+ # (B, T_out, num_mels)
183
+ mel_outputs = mel_outputs.view(
184
+ mel_outputs.size(0), -1, self.num_mels)
185
+ return mel_outputs, alignments, stop_outputs
186
+
187
+ def attend(self, decoder_input):
188
+ cell_input = torch.cat((decoder_input, self.attention_context), -1)
189
+ self.attention_hidden, self.attention_cell = self.attention_rnn(
190
+ cell_input, (self.attention_hidden, self.attention_cell))
191
+ self.attention_context, attention_weights = self.attention_layer(
192
+ self.attention_hidden, self.memory, None, self.mask)
193
+
194
+ decoder_rnn_input = torch.cat(
195
+ (self.attention_hidden, self.attention_context), -1)
196
+
197
+ return decoder_rnn_input, self.attention_context, attention_weights
198
+
199
+ def decode(self, decoder_input):
200
+ for i in range(self.num_decoder_rnn_layer):
201
+ if i == 0:
202
+ self.decoder_hiddens[i], self.decoder_cells[i] = self.decoder_rnn_layers[i](
203
+ decoder_input, (self.decoder_hiddens[i], self.decoder_cells[i]))
204
+ else:
205
+ self.decoder_hiddens[i], self.decoder_cells[i] = self.decoder_rnn_layers[i](
206
+ self.decoder_hiddens[i-1], (self.decoder_hiddens[i], self.decoder_cells[i]))
207
+ return self.decoder_hiddens[-1]
208
+
209
+ def forward(self, memory, mel_inputs, memory_lengths):
210
+ """ Decoder forward pass for training
211
+ Args:
212
+ memory: (B, T_enc, enc_dim) Encoder outputs
213
+ mel_inputs: (B, T, num_mels) Ground-truth mel frames used for teacher forcing.
214
+ memory_lengths: (B, ) Encoder output lengths for attention masking.
215
+ Returns:
216
+ mel_outputs: (B, T, num_mels) mel outputs from the decoder
217
+ alignments: (B, T//r, T_enc) attention weights.
218
+ """
219
+ # [1, B, num_mels]
220
+ go_frame = self.get_go_frame(memory).unsqueeze(0)
221
+ # [T//r, B, num_mels]
222
+ mel_inputs = self.parse_decoder_inputs(mel_inputs)
223
+ # [T//r + 1, B, num_mels]
224
+ mel_inputs = torch.cat((go_frame, mel_inputs), dim=0)
225
+ # [T//r + 1, B, prenet_dim]
226
+ decoder_inputs = self.prenet(mel_inputs)
227
+ # decoder_inputs_pitch = self.prenet_pitch(decoder_inputs__)
228
+
229
+ self.initialize_decoder_states(
230
+ memory, mask=~get_mask_from_lengths(memory_lengths),
231
+ )
232
+
233
+ self.attention_layer.init_states(memory)
234
+ # self.attention_layer_pitch.init_states(memory_pitch)
235
+
236
+ mel_outputs, alignments = [], []
237
+ if self.use_stop_tokens:
238
+ stop_outputs = []
239
+ else:
240
+ stop_outputs = None
241
+ while len(mel_outputs) < decoder_inputs.size(0) - 1:
242
+ decoder_input = decoder_inputs[len(mel_outputs)]
243
+ # decoder_input_pitch = decoder_inputs_pitch[len(mel_outputs)]
244
+
245
+ decoder_rnn_input, context, attention_weights = self.attend(decoder_input)
246
+
247
+ decoder_rnn_output = self.decode(decoder_rnn_input)
248
+ if self.concat_context_to_last:
249
+ decoder_rnn_output = torch.cat(
250
+ (decoder_rnn_output, context), dim=1)
251
+
252
+ mel_output = self.linear_projection(decoder_rnn_output)
253
+ if self.use_stop_tokens:
254
+ stop_output = self.stop_layer(decoder_rnn_output)
255
+ stop_outputs += [stop_output.squeeze()]
256
+ mel_outputs += [mel_output.squeeze(1)]  # NOTE: squeeze(1) is likely unnecessary here
257
+ alignments += [attention_weights]
258
+ # alignments_pitch += [attention_weights_pitch]
259
+
260
+ mel_outputs, alignments, stop_outputs = self.parse_decoder_outputs(
261
+ mel_outputs, alignments, stop_outputs)
262
+ if stop_outputs is None:
263
+ return mel_outputs, alignments
264
+ else:
265
+ return mel_outputs, stop_outputs, alignments
266
+
267
+ def inference(self, memory, stop_threshold=0.5):
268
+ """ Decoder inference
269
+ Args:
270
+ memory: (1, T_enc, D_enc) Encoder outputs
271
+ Returns:
272
+ mel_outputs: mel outputs from the decoder
273
+ alignments: sequence of attention weights from the decoder
274
+ """
275
+ # [1, num_mels]
276
+ decoder_input = self.get_go_frame(memory)
277
+
278
+ self.initialize_decoder_states(memory, mask=None)
279
+
280
+ self.attention_layer.init_states(memory)
281
+
282
+ mel_outputs, alignments = [], []
283
+ # NOTE(sx): heuristic
284
+ max_decoder_step = memory.size(1)*self.encoder_down_factor//self.frames_per_step
285
+ min_decoder_step = memory.size(1)*self.encoder_down_factor // self.frames_per_step - 5
286
+ while True:
287
+ decoder_input = self.prenet(decoder_input)
288
+
289
+ decoder_input_final, context, alignment = self.attend(decoder_input)
290
+
291
+ #mel_output, stop_output, alignment = self.decode(decoder_input)
292
+ decoder_rnn_output = self.decode(decoder_input_final)
293
+ if self.concat_context_to_last:
294
+ decoder_rnn_output = torch.cat(
295
+ (decoder_rnn_output, context), dim=1)
296
+
297
+ mel_output = self.linear_projection(decoder_rnn_output)
298
+ stop_output = self.stop_layer(decoder_rnn_output)
299
+
300
+ mel_outputs += [mel_output.squeeze(1)]
301
+ alignments += [alignment]
302
+
303
+ if torch.sigmoid(stop_output.data) > stop_threshold and len(mel_outputs) >= min_decoder_step:
304
+ break
305
+ if len(mel_outputs) >= max_decoder_step:
306
+ # print("Warning! Decoding steps reaches max decoder steps.")
307
+ break
308
+
309
+ decoder_input = mel_output[:,-self.num_mels:]
310
+
311
+
312
+ mel_outputs, alignments, _ = self.parse_decoder_outputs(
313
+ mel_outputs, alignments, None)
314
+
315
+ return mel_outputs, alignments
316
+
317
+ def inference_batched(self, memory, stop_threshold=0.5):
318
+ """ Decoder inference
319
+ Args:
320
+ memory: (B, T_enc, D_enc) Encoder outputs
321
+ Returns:
322
+ mel_outputs: mel outputs from the decoder
323
+ alignments: sequence of attention weights from the decoder
324
+ """
325
+ # [1, num_mels]
326
+ decoder_input = self.get_go_frame(memory)
327
+
328
+ self.initialize_decoder_states(memory, mask=None)
329
+
330
+ self.attention_layer.init_states(memory)
331
+
332
+ mel_outputs, alignments = [], []
333
+ stop_outputs = []
334
+ # NOTE(sx): heuristic
335
+ max_decoder_step = memory.size(1)*self.encoder_down_factor//self.frames_per_step
336
+ min_decoder_step = memory.size(1)*self.encoder_down_factor // self.frames_per_step - 5
337
+ while True:
338
+ decoder_input = self.prenet(decoder_input)
339
+
340
+ decoder_input_final, context, alignment = self.attend(decoder_input)
341
+
342
+ #mel_output, stop_output, alignment = self.decode(decoder_input)
343
+ decoder_rnn_output = self.decode(decoder_input_final)
344
+ if self.concat_context_to_last:
345
+ decoder_rnn_output = torch.cat(
346
+ (decoder_rnn_output, context), dim=1)
347
+
348
+ mel_output = self.linear_projection(decoder_rnn_output)
349
+ # (B, 1)
350
+ stop_output = self.stop_layer(decoder_rnn_output)
351
+ stop_outputs += [stop_output.squeeze()]
352
+ # stop_outputs.append(stop_output)
353
+
354
+ mel_outputs += [mel_output.squeeze(1)]
355
+ alignments += [alignment]
356
+ # print(stop_output.shape)
357
+ if torch.all(torch.sigmoid(stop_output.squeeze().data) > stop_threshold) \
358
+ and len(mel_outputs) >= min_decoder_step:
359
+ break
360
+ if len(mel_outputs) >= max_decoder_step:
361
+ # print("Warning! Decoding steps reaches max decoder steps.")
362
+ break
363
+
364
+ decoder_input = mel_output[:,-self.num_mels:]
365
+
366
+
367
+ mel_outputs, alignments, stop_outputs = self.parse_decoder_outputs(
368
+ mel_outputs, alignments, stop_outputs)
369
+ mel_outputs_stacked = []
370
+ for mel, stop_logit in zip(mel_outputs, stop_outputs):
371
+ idx = np.argwhere(torch.sigmoid(stop_logit.cpu()) > stop_threshold)[0][0].item()
372
+ mel_outputs_stacked.append(mel[:idx,:])
373
+ mel_outputs = torch.cat(mel_outputs_stacked, dim=0).unsqueeze(0)
374
+ return mel_outputs, alignments
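Editor's note: a standalone shape-check sketch for the MOL-attention Decoder above, not part of the commit. It assumes the package's .utils submodules (MOLAttention, Linear, get_mask_from_lengths) are importable, and all dimensions are arbitrary; with frames_per_step=2 and encoder_down_factor=4 the attention step is r = 2/4 = 0.5 encoder frames, so 200 mel frames pair with 50 encoder frames.
# Hypothetical teacher-forced forward pass with assumed dimensions
import torch
from ppg2mel.rnn_decoder_mol import Decoder

decoder = Decoder(
    enc_dim=256, num_mels=80, frames_per_step=2,
    attention_rnn_dim=512, decoder_rnn_dim=512,
    prenet_dims=[256, 128], num_mixtures=5,
    encoder_down_factor=4, use_stop_tokens=True,
)
memory = torch.randn(2, 50, 256)            # (B, T_enc, enc_dim) encoder outputs
mels = torch.randn(2, 200, 80)              # (B, T_mel, num_mels); T_mel divisible by frames_per_step
memory_lengths = torch.LongTensor([50, 50])

mel_out, stop_logits, alignments = decoder(memory, mels, memory_lengths)
print(mel_out.shape, alignments.shape)      # torch.Size([2, 200, 80]) torch.Size([2, 100, 50])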