diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..4eca27be6d8721a0bc619989952418132b3d59aa --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +docs +logs +output +reference +SoVITS_weights +GPT_weights +TEMP +.git diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0bb4e0bf22a20558257328203d997569b4a83722 --- /dev/null +++ b/.gitignore @@ -0,0 +1,200 @@ +.DS_Store +.vscode +__pycache__ +*.pyc +env +runtime +.idea +output +logs +reference +GPT_weights +SoVITS_weights +GPT_weights_v2 +SoVITS_weights_v2 +GPT_weights_v3 +SoVITS_weights_v3 +TEMP +weight.json +ffmpeg* +ffprobe* +cfg.json +speakers.json +ref_audios +tools/AP_BWE_main/24kto48k/* +!tools/AP_BWE_main/24kto48k/readme.txt + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/Colab-Inference.ipynb b/Colab-Inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8a31701312c50aaf9575292ca1b97509631fd7d3 --- /dev/null +++ b/Colab-Inference.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-SoVITS Infer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Env Setup (Run Once Only)\n", + "## 环境配置, 只需运行一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e9b7iFV3dm1f" + }, + "outputs": [], + "source": [ + "%%writefile /content/setup.sh\n", + "set -e\n", + "\n", + "cd /content\n", + "\n", + "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", + "\n", + "cd GPT-SoVITS\n", + "\n", + "mkdir GPT_weights\n", + "\n", + "mkdir SoVITS_weights\n", + "\n", + "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", + " :\n", + "else\n", + " conda create -n GPTSoVITS python=3.10 -y\n", + "fi\n", + "\n", + "source activate GPTSoVITS\n", + "\n", + "pip install ipykernel\n", + "\n", + "bash install.sh --source HF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "0NgxXg5sjv7z" + }, + "outputs": [], + "source": [ + "%pip install -q condacolab\n", + "import condacolab\n", + "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", + "!cd /content && bash setup.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download From HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "vbZY-LnM0tzq" + }, + "outputs": [], + "source": [ + "# Modify These\n", + "USER_ID = \"AkitoP\"\n", + "REPO_NAME = \"GPT-SoVITS-v2-aegi\"\n", + "BRANCH = \"main\"\n", + "GPT_PATH = \"new_aegigoe-e100.ckpt\"\n", + "SOVITS_PATH = \"new_aegigoe_e60_s32220.pth\"\n", + "\n", + "# Do Not Modify\n", + "HF_BASE = \"https://huggingface.co\"\n", + "REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n", + "GPT_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{GPT_PATH}\"\n", + "SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{SOVITS_PATH}\"\n", + "\n", + "!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n", + "!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download From ModelScope" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Modify These\n", + "USER_ID = \"aihobbyist\"\n", + "REPO_NAME = \"GPT-SoVits-V2-models\"\n", + "BRANCH = \"master\"\n", + "GPT_PATH = \"Genshin_Impact/EN/GPT_GenshinImpact_EN_5.1.ckpt\"\n", + "SOVITS_PATH = \"Wuthering_Waves/CN/SV_WutheringWaves_CN_1.3.pth\"\n", + "\n", + "# Do Not Modify\n", + "HF_BASE = \"https://www.modelscope.cn/models\"\n", + "REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n", + "GPT_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{GPT_PATH}\"\n", + "SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{SOVITS_PATH}\"\n", + "\n", + "!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n", + "!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Launch WebUI\n", + "# 启动 WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "4oRGUzkrk8C7" + }, + "outputs": [], + "source": [ + "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..80cd9f3a106c9c997c23c0aa7c8fecab7039001b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,42 @@ +# Base CUDA image +FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 + +LABEL maintainer="breakstring@hotmail.com" +LABEL version="dev-20240209" +LABEL description="Docker image for GPT-SoVITS" + + +# Install 3rd party apps +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +RUN apt-get update && \ + apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ + git lfs install && \ + rm -rf /var/lib/apt/lists/* + +# Copy only requirements.txt initially to leverage Docker 
cache +WORKDIR /workspace +COPY requirements.txt /workspace/ +RUN pip install --no-cache-dir -r requirements.txt + +# Define a build-time argument for image type +ARG IMAGE_TYPE=full + +# Conditional logic based on the IMAGE_TYPE argument +# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" +COPY ./Docker /workspace/Docker +# elite 类型的镜像里面不包含额外的模型 +RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ + chmod +x /workspace/Docker/download.sh && \ + /workspace/Docker/download.sh && \ + python /workspace/Docker/download.py && \ + python -m nltk.downloader averaged_perceptron_tagger cmudict; \ + fi + + +# Copy the rest of the application +COPY . /workspace + +EXPOSE 9871 9872 9873 9874 9880 + +CMD ["python", "webui.py"] diff --git a/GPT_SoVITS/AR/__init__.py b/GPT_SoVITS/AR/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GPT_SoVITS/AR/data/__init__.py b/GPT_SoVITS/AR/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..d84573340732b916eaa6e9f8e88cb4166d6f1ca5 --- /dev/null +++ b/GPT_SoVITS/AR/data/bucket_sampler.py @@ -0,0 +1,149 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py +# reference: https://github.com/lifeiteng/vall-e +import itertools +import math +import random +from random import shuffle +from typing import Iterator, Optional, TypeVar + +import torch +import torch.distributed as dist +from torch.utils.data import Dataset, Sampler + +__all__ = [ + "DistributedBucketSampler", +] + +T_co = TypeVar("T_co", covariant=True) + + +class DistributedBucketSampler(Sampler[T_co]): + r""" + sort the dataset wrt. input length + divide samples into buckets + sort within buckets + divide buckets into batches + sort batches + """ + + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + batch_size: int = 32, + ) -> None: + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() if torch.cuda.is_available() else 0 + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + if rank >= num_replicas or rank < 0: + raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1)) + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.dataset) % self.num_replicas != 0: # type: ignore[arg-type] + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler. 
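# A minimal standalone sketch (illustrative, not part of this patch) of the
# drop_last arithmetic used just below, assuming a hypothetical dataset of 10
# samples split across 4 replicas: with drop_last the tail is rounded away,
# otherwise __iter__ later pads the index list up to num_samples * num_replicas.
import math
dataset_len, num_replicas = 10, 4  # hypothetical sizes, for illustration only
per_rank_drop = math.ceil((dataset_len - num_replicas) / num_replicas)  # 2 -> total 8, tail dropped
per_rank_pad = math.ceil(dataset_len / num_replicas)                    # 3 -> total 12, head indices re-used as padding
assert per_rank_drop * num_replicas <= dataset_len <= per_rank_pad * num_replicas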
+ self.num_samples = math.ceil( + (len(self.dataset) - self.num_replicas) / self.num_replicas, # type: ignore[arg-type] + ) + else: + self.num_samples = math.ceil( + len(self.dataset) / self.num_replicas, + ) # type: ignore[arg-type] + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + self.batch_size = batch_size + self.id_with_length = self._get_sample_lengths() + self.id_buckets = self.make_buckets(bucket_width=2.0) + + def _get_sample_lengths(self): + id_with_lengths = [] + for i in range(len(self.dataset)): + id_with_lengths.append((i, self.dataset.get_sample_length(i))) + id_with_lengths.sort(key=lambda x: x[1]) + return id_with_lengths + + def make_buckets(self, bucket_width: float = 2.0): + buckets = [] + cur = [] + max_sec = bucket_width + for id, sec in self.id_with_length: + if sec < max_sec: + cur.append(id) + else: + buckets.append(cur) + cur = [id] + max_sec += bucket_width + if len(cur) > 0: + buckets.append(cur) + return buckets + + def __iter__(self) -> Iterator[T_co]: + if self.shuffle: + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + random.seed(self.epoch + self.seed) + shuffled_bucket = [] + for buc in self.id_buckets: + buc_copy = buc.copy() + shuffle(buc_copy) + shuffled_bucket.append(buc_copy) + grouped_batch_size = self.batch_size * self.num_replicas + shuffled_bucket = list(itertools.chain(*shuffled_bucket)) + n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) + batches = [shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] for b in range(n_batch)] + shuffle(batches) + indices = list(itertools.chain(*batches)) + else: + # type: ignore[arg-type] + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size] + else: + # remove tail of data to make it evenly divisible. + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self) -> int: + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + r""" + Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas + use a different random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. 
+ """ + self.epoch = epoch diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py new file mode 100644 index 0000000000000000000000000000000000000000..f360503ba06301fc130ae4afbe27ecae4dae33ef --- /dev/null +++ b/GPT_SoVITS/AR/data/data_module.py @@ -0,0 +1,81 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py +# reference: https://github.com/lifeiteng/vall-e +from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader + +from AR.data.bucket_sampler import DistributedBucketSampler +from AR.data.dataset import Text2SemanticDataset + + +class Text2SemanticDataModule(LightningDataModule): + def __init__( + self, + config, + train_semantic_path, + train_phoneme_path, + dev_semantic_path=None, + dev_phoneme_path=None, + ): + super().__init__() + self.config = config + self.train_semantic_path = train_semantic_path + self.train_phoneme_path = train_phoneme_path + self.dev_semantic_path = dev_semantic_path + self.dev_phoneme_path = dev_phoneme_path + self.num_workers = self.config["data"]["num_workers"] + + def prepare_data(self): + pass + + def setup(self, stage=None, output_logs=False): + self._train_dataset = Text2SemanticDataset( + phoneme_path=self.train_phoneme_path, + semantic_path=self.train_semantic_path, + max_sec=self.config["data"]["max_sec"], + pad_val=self.config["data"]["pad_val"], + ) + self._dev_dataset = self._train_dataset + # self._dev_dataset = Text2SemanticDataset( + # phoneme_path=self.dev_phoneme_path, + # semantic_path=self.dev_semantic_path, + # max_sample=self.config['data']['max_eval_sample'], + # max_sec=self.config['data']['max_sec'], + # pad_val=self.config['data']['pad_val']) + + def train_dataloader(self): + batch_size = ( + self.config["train"]["batch_size"] // 2 + if self.config["train"].get("if_dpo", False) is True + else self.config["train"]["batch_size"] + ) + batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # 防止不保存 + sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) + return DataLoader( + self._train_dataset, + batch_size=batch_size, + sampler=sampler, + collate_fn=self._train_dataset.collate, + num_workers=self.num_workers, + persistent_workers=True, + prefetch_factor=16, + ) + + def val_dataloader(self): + return DataLoader( + self._dev_dataset, + batch_size=1, + shuffle=False, + collate_fn=self._train_dataset.collate, + num_workers=max(self.num_workers, 12), + persistent_workers=True, + prefetch_factor=16, + ) + + # 这个会使用到嘛? 
+ def test_dataloader(self): + return DataLoader( + self._dev_dataset, + batch_size=1, + shuffle=False, + collate_fn=self._train_dataset.collate, + ) diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..402483d918bd05b8609fbcde4eb18f87b2242560 --- /dev/null +++ b/GPT_SoVITS/AR/data/dataset.py @@ -0,0 +1,320 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py +# reference: https://github.com/lifeiteng/vall-e + +# sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert") +import os +import traceback +from typing import Dict, List + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import DataLoader, Dataset + +version = os.environ.get("version", None) + +from text import cleaned_text_to_sequence + +# from config import exp_dir + + +def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0): + seq = sequences[0] + ndim = seq.ndim + if axis < 0: + axis += ndim + dtype = seq.dtype + pad_value = dtype.type(pad_value) + seq_lengths = [seq.shape[axis] for seq in sequences] + max_length = np.max(seq_lengths) + + padded_sequences = [] + for seq, length in zip(sequences, seq_lengths): + padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) + padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value) + padded_sequences.append(padded_seq) + batch = np.stack(padded_sequences) + return batch + + +class Text2SemanticDataset(Dataset): + """dataset class for text tokens to semantic model training.""" + + def __init__( + self, + phoneme_path: str, + semantic_path: str, + max_sample: int = None, + max_sec: int = 100, + pad_val: int = 1024, + # min value of phoneme/sec + min_ps_ratio: int = 3, + # max value of phoneme/sec + max_ps_ratio: int = 25, + ) -> None: + super().__init__() + + self.semantic_data = pd.read_csv( + semantic_path, + delimiter="\t", + encoding="utf-8", + ) + # get dict + self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path + self.path3 = "%s/3-bert" % ( + os.path.dirname( + phoneme_path, + ) + ) # "%s/3-bert"%exp_dir#bert_dir + self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path + assert os.path.exists(self.path2) + assert os.path.exists(self.path6) + self.phoneme_data = {} + with open(self.path2, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + for line in lines: + tmp = line.split("\t") + if len(tmp) != 4: + continue + self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]] + + # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item() + # pad for semantic tokens + self.PAD: int = pad_val + # self.hz = 25 + # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read() + # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz + # self.hz=int(data[:-2])# + self.hz = int(os.environ.get("hz", "25hz")[:-2]) + + # max seconds of semantic token + self.max_sec = max_sec + self.min_ps_ratio = min_ps_ratio + self.max_ps_ratio = max_ps_ratio + + if max_sample is not None: + self.semantic_data = self.semantic_data[:max_sample] + + # {idx: (semantic, phoneme)} + # semantic list, phoneme list + self.semantic_phoneme = [] + self.item_names = [] + + self.inited = False + + if not self.inited: + # 调用初始化函数 + self.init_batch() + self.inited = True + del self.semantic_data + del self.phoneme_data + # self.tokenizer = 
AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large") + # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large") + + def init_batch(self): + semantic_data_len = len(self.semantic_data) + phoneme_data_len = len(self.phoneme_data.keys()) + print("semantic_data_len:", semantic_data_len) + print("phoneme_data_len:", phoneme_data_len) + print(self.semantic_data) + idx = 0 + num_not_in = 0 + num_deleted_bigger = 0 + num_deleted_ps = 0 + for i in range(semantic_data_len): + # 先依次遍历 + # get str + item_name = self.semantic_data.iloc[i, 0] + # print(self.phoneme_data) + try: + phoneme, word2ph, text = self.phoneme_data[item_name] + except Exception: + traceback.print_exc() + # print(f"{item_name} not in self.phoneme_data !") + num_not_in += 1 + continue + + semantic_str = self.semantic_data.iloc[i, 1] + # get token list + semantic_ids = [int(idx) for idx in semantic_str.split(" ")] + # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len + # 过滤掉太长的样本 + if ( + len(semantic_ids) > self.max_sec * self.hz + ): #########1###根据token个数推测总时长过滤时长60s(config里)#40*25=1k + num_deleted_bigger += 1 + continue + # (T, ), 这个速度不会很慢,所以可以在一开始就处理,无需在 __getitem__ 里面单个处理#### + phoneme = phoneme.split(" ") + + try: + phoneme_ids = cleaned_text_to_sequence(phoneme, version) + except: + traceback.print_exc() + # print(f"{item_name} not in self.phoneme_data !") + num_not_in += 1 + continue + # if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行 + if len(phoneme_ids) > self.max_sec * self.hz / 2.5: ###########2:改为恒定限制为semantic/2.5就行 + num_deleted_ps += 1 + continue + # if len(semantic_ids) > 1000:###########3 + # num_deleted_bigger += 1 + # continue + + ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz) + + if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio: ##########4#3~25#每秒多少个phone + num_deleted_ps += 1 + # print(item_name) + continue + + self.semantic_phoneme.append((semantic_ids, phoneme_ids)) + idx += 1 + self.item_names.append(item_name) + + min_num = 100 # 20直接不补#30补了也不存ckpt + leng = len(self.semantic_phoneme) + if leng < min_num: + tmp1 = self.semantic_phoneme + tmp2 = self.item_names + self.semantic_phoneme = [] + self.item_names = [] + for _ in range(max(2, int(min_num / leng))): + self.semantic_phoneme += tmp1 + self.item_names += tmp2 + if num_not_in > 0: + print(f"there are {num_not_in} semantic datas not in phoneme datas") + if num_deleted_bigger > 0: + print( + f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds", + ) + if num_deleted_ps > 0: + # 4702 for LibriTTS, LirbriTTS 是标注数据, 是否需要筛?=> 需要,有值为 100 的极端值 + print( + f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}", + ) + """ + there are 31 semantic datas not in phoneme datas + deleted 34 audios who's duration are bigger than 54 seconds + deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3 + dataset.__len__(): 366463 + + """ + # 345410 for LibriTTS + print("dataset.__len__():", self.__len__()) + + def __get_item_names__(self) -> List[str]: + return self.item_names + + def __len__(self) -> int: + return len(self.semantic_phoneme) + + def __getitem__(self, idx: int) -> Dict: + semantic_ids, phoneme_ids = self.semantic_phoneme[idx] + item_name = self.item_names[idx] + phoneme_ids_len = len(phoneme_ids) + # semantic tokens target + semantic_ids_len = len(semantic_ids) + + flag = 0 + path_bert = "%s/%s.pt" % 
(self.path3, item_name) + if os.path.exists(path_bert) == True: + bert_feature = torch.load(path_bert, map_location="cpu") + else: + flag = 1 + if flag == 1: + # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32) + bert_feature = None + else: + assert bert_feature.shape[-1] == len(phoneme_ids) + return { + "idx": idx, + "phoneme_ids": phoneme_ids, + "phoneme_ids_len": phoneme_ids_len, + "semantic_ids": semantic_ids, + "semantic_ids_len": semantic_ids_len, + "bert_feature": bert_feature, + } + + def get_sample_length(self, idx: int): + semantic_ids = self.semantic_phoneme[idx][0] + sec = 1.0 * len(semantic_ids) / self.hz + return sec + + def collate(self, examples: List[Dict]) -> Dict: + sample_index: List[int] = [] + phoneme_ids: List[torch.Tensor] = [] + phoneme_ids_lens: List[int] = [] + semantic_ids: List[torch.Tensor] = [] + semantic_ids_lens: List[int] = [] + # return + + for item in examples: + sample_index.append(item["idx"]) + phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64)) + semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64)) + phoneme_ids_lens.append(item["phoneme_ids_len"]) + semantic_ids_lens.append(item["semantic_ids_len"]) + + # pad 0 + phoneme_ids = batch_sequences(phoneme_ids) + semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD) + + # # convert each batch to torch.tensor + phoneme_ids = torch.tensor(phoneme_ids) + semantic_ids = torch.tensor(semantic_ids) + phoneme_ids_lens = torch.tensor(phoneme_ids_lens) + semantic_ids_lens = torch.tensor(semantic_ids_lens) + bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens)) + bert_padded.zero_() + + for idx, item in enumerate(examples): + bert = item["bert_feature"] + if bert != None: + bert_padded[idx, :, : bert.shape[-1]] = bert + + return { + # List[int] + "ids": sample_index, + # torch.Tensor (B, max_phoneme_length) + "phoneme_ids": phoneme_ids, + # torch.Tensor (B) + "phoneme_ids_len": phoneme_ids_lens, + # torch.Tensor (B, max_semantic_ids_length) + "semantic_ids": semantic_ids, + # torch.Tensor (B) + "semantic_ids_len": semantic_ids_lens, + # torch.Tensor (B, 1024, max_phoneme_length) + "bert_feature": bert_padded, + } + + +if __name__ == "__main__": + root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/" + dataset = Text2SemanticDataset( + phoneme_path=root_dir + "phoneme_train.npy", + semantic_path=root_dir + "semantic_train.tsv", + ) + + batch_size = 12 + dataloader = DataLoader( + dataset, + batch_size=batch_size, + collate_fn=dataset.collate, + shuffle=False, + ) + for i, batch in enumerate(dataloader): + if i % 1000 == 0: + print(i) + # if i == 0: + # print('batch["ids"]:', batch["ids"]) + # print('batch["phoneme_ids"]:', batch["phoneme_ids"], + # batch["phoneme_ids"].shape) + # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"], + # batch["phoneme_ids_len"].shape) + # print('batch["semantic_ids"]:', batch["semantic_ids"], + # batch["semantic_ids"].shape) + # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"], + # batch["semantic_ids_len"].shape) diff --git a/GPT_SoVITS/AR/models/__init__.py b/GPT_SoVITS/AR/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py new file mode 100644 index 0000000000000000000000000000000000000000..0696c35f8036af098be493c924cf3b54c99fdb37 --- /dev/null +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ 
-0,0 +1,145 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py +# reference: https://github.com/lifeiteng/vall-e +import os +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from typing import Dict + +import torch +from pytorch_lightning import LightningModule + +from AR.models.t2s_model import Text2SemanticDecoder +from AR.modules.lr_schedulers import WarmupCosineLRSchedule +from AR.modules.optim import ScaledAdam + + +class Text2SemanticLightningModule(LightningModule): + def __init__(self, config, output_dir, is_train=True): + super().__init__() + self.config = config + self.top_k = 3 + self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) + pretrained_s1 = config.get("pretrained_s1") + if pretrained_s1 and is_train: + # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) + print( + self.load_state_dict( + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], + ) + ) + if is_train: + self.automatic_optimization = False + self.save_hyperparameters() + self.eval_dir = output_dir / "eval" + self.eval_dir.mkdir(parents=True, exist_ok=True) + + def training_step(self, batch: Dict, batch_idx: int): + opt = self.optimizers() + scheduler = self.lr_schedulers() + forward = self.model.forward if self.config["train"].get("if_dpo", False) == True else self.model.forward_old + loss, acc = forward( + batch["phoneme_ids"], + batch["phoneme_ids_len"], + batch["semantic_ids"], + batch["semantic_ids_len"], + batch["bert_feature"], + ) + self.manual_backward(loss) + if batch_idx > 0 and batch_idx % 4 == 0: + opt.step() + opt.zero_grad() + scheduler.step() + + self.log( + "total_loss", + loss, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + "lr", + scheduler.get_last_lr()[0], + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + f"top_{self.top_k}_acc", + acc, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + + def validation_step(self, batch: Dict, batch_idx: int): + return + + # # get loss + # loss, acc = self.model.forward( + # batch['phoneme_ids'], batch['phoneme_ids_len'], + # batch['semantic_ids'], batch['semantic_ids_len'], + # batch['bert_feature'] + # ) + # + # self.log( + # "val_total_loss", + # loss, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # self.log( + # f"val_top_{self.top_k}_acc", + # acc, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # + # # get infer output + # semantic_len = batch['semantic_ids'].size(1) + # prompt_len = min(int(semantic_len * 0.5), 150) + # prompt = batch['semantic_ids'][:, :prompt_len] + # pred_semantic = self.model.infer(batch['phoneme_ids'], + # batch['phoneme_ids_len'], prompt, + # batch['bert_feature'] + # ) + # save_name = f'semantic_toks_{batch_idx}.pt' + # save_path = os.path.join(self.eval_dir, save_name) + # torch.save(pred_semantic.detach().cpu(), save_path) + + def configure_optimizers(self): + model_parameters = self.model.parameters() + parameters_names = [] + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) + lm_opt = ScaledAdam( + model_parameters, + lr=0.01, + betas=(0.9, 0.95), + clipping_scale=2.0, + parameters_names=parameters_names, + show_dominant_parameters=False, + clipping_update_period=1000, + ) + + return { + "optimizer": lm_opt, + "lr_scheduler": { + "scheduler": WarmupCosineLRSchedule( + lm_opt, + 
init_lr=self.config["optimizer"]["lr_init"], + peak_lr=self.config["optimizer"]["lr"], + end_lr=self.config["optimizer"]["lr_end"], + warmup_steps=self.config["optimizer"]["warmup_steps"], + total_steps=self.config["optimizer"]["decay_steps"], + ) + }, + } diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b0ab59c4c80ed2cf0f7b5aa93e27cd60bf8279c7 --- /dev/null +++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py @@ -0,0 +1,110 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py +# reference: https://github.com/lifeiteng/vall-e +import os +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from typing import Dict + +import torch +from pytorch_lightning import LightningModule + +from AR.models.t2s_model_onnx import Text2SemanticDecoder +from AR.modules.lr_schedulers import WarmupCosineLRSchedule +from AR.modules.optim import ScaledAdam + + +class Text2SemanticLightningModule(LightningModule): + def __init__(self, config, output_dir, is_train=True): + super().__init__() + self.config = config + self.top_k = 3 + self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) + pretrained_s1 = config.get("pretrained_s1") + if pretrained_s1 and is_train: + # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) + print( + self.load_state_dict( + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], + ), + ) + if is_train: + self.automatic_optimization = False + self.save_hyperparameters() + self.eval_dir = output_dir / "eval" + self.eval_dir.mkdir(parents=True, exist_ok=True) + + def training_step(self, batch: Dict, batch_idx: int): + opt = self.optimizers() + scheduler = self.lr_schedulers() + loss, acc = self.model.forward( + batch["phoneme_ids"], + batch["phoneme_ids_len"], + batch["semantic_ids"], + batch["semantic_ids_len"], + batch["bert_feature"], + ) + self.manual_backward(loss) + if batch_idx > 0 and batch_idx % 4 == 0: + opt.step() + opt.zero_grad() + scheduler.step() + + self.log( + "total_loss", + loss, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + "lr", + scheduler.get_last_lr()[0], + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + f"top_{self.top_k}_acc", + acc, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + + def validation_step(self, batch: Dict, batch_idx: int): + return + + def configure_optimizers(self): + model_parameters = self.model.parameters() + parameters_names = [] + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) + lm_opt = ScaledAdam( + model_parameters, + lr=0.01, + betas=(0.9, 0.95), + clipping_scale=2.0, + parameters_names=parameters_names, + show_dominant_parameters=False, + clipping_update_period=1000, + ) + + return { + "optimizer": lm_opt, + "lr_scheduler": { + "scheduler": WarmupCosineLRSchedule( + lm_opt, + init_lr=self.config["optimizer"]["lr_init"], + peak_lr=self.config["optimizer"]["lr"], + end_lr=self.config["optimizer"]["lr_end"], + warmup_steps=self.config["optimizer"]["warmup_steps"], + total_steps=self.config["optimizer"]["decay_steps"], + ) + }, + } diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4725b7a3b2657cf1c83a035e7dd796ce51703745 --- 
/dev/null +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -0,0 +1,935 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py +# reference: https://github.com/lifeiteng/vall-e +import math +from typing import List, Optional + +import torch +from torch import nn +from torch.nn import functional as F +from torchmetrics.classification import MulticlassAccuracy +from tqdm import tqdm + +from AR.models.utils import ( + dpo_loss, + get_batch_logps, + make_pad_mask, + make_pad_mask_left, + make_reject_y, + sample, + topk_sampling, +) +from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer + +default_config = { + "embedding_dim": 512, + "hidden_dim": 512, + "num_head": 8, + "num_layers": 12, + "num_codebook": 8, + "p_dropout": 0.0, + "vocab_size": 1024 + 1, + "phoneme_vocab_size": 512, + "EOS": 1024, +} + + +# @torch.jit.script ## 使用的话首次推理会非常慢,而且推理速度不稳定 +# Efficient implementation equivalent to the following: +def scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + B, H, L, S = query.size(0), query.size(1), query.size(-2), key.size(-2) + if scale is None: + scale_factor = torch.tensor(1 / math.sqrt(query.size(-1))) + else: + scale_factor = scale + attn_bias = torch.zeros(B, H, L, S, dtype=query.dtype, device=query.device) + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_bias.masked_fill_(attn_mask, float("-inf")) + else: + attn_bias += attn_mask + attn_weight = query @ key.transpose(-2, -1) * scale_factor + attn_weight += attn_bias + attn_weight = torch.softmax(attn_weight, dim=-1) + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_weight.masked_fill_(attn_mask, 0) + else: + attn_mask[attn_mask != float("-inf")] = 0 + attn_mask[attn_mask == float("-inf")] = 1 + attn_weight.masked_fill_(attn_mask, 0) + + return attn_weight @ value + + +@torch.jit.script +class T2SMLP: + def __init__(self, w1, b1, w2, b2): + self.w1 = w1 + self.b1 = b1 + self.w2 = w2 + self.b2 = b2 + + def forward(self, x): + x = F.relu(F.linear(x, self.w1, self.b1)) + x = F.linear(x, self.w2, self.b2) + return x + + +@torch.jit.script +class T2SBlock: + def __init__( + self, + num_heads, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1, + norm_w2, + norm_b2, + norm_eps2, + ): + self.num_heads = num_heads + self.mlp = mlp + self.hidden_dim: int = hidden_dim + self.qkv_w = qkv_w + self.qkv_b = qkv_b + self.out_w = out_w + self.out_b = out_b + self.norm_w1 = norm_w1 + self.norm_b1 = norm_b1 + self.norm_eps1 = norm_eps1 + self.norm_w2 = norm_w2 + self.norm_b2 = norm_b2 + self.norm_eps2 = norm_eps2 + + self.false = torch.tensor(False, dtype=torch.bool) + + @torch.jit.ignore + def to_mask( + self, + x: torch.Tensor, + padding_mask: Optional[torch.Tensor], + ): + if padding_mask is None: + return x + + if padding_mask.dtype == torch.bool: + return x.masked_fill(padding_mask, 0) + else: + return x * padding_mask + + def process_prompt( + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): + q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + batch_size = q.shape[0] + q_len = q.shape[1] + kv_len = k.shape[1] + + q = 
self.to_mask(q, padding_mask) + k_cache = self.to_mask(k, padding_mask) + v_cache = self.to_mask(v, padding_mask) + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + if torch_sdpa: + attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) + else: + attn = scaled_dot_product_attention(q, k, v, attn_mask) + + attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1) + attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) + + x = x + attn + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) + x = x + self.mlp.forward(x) + x = F.layer_norm( + x, + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + def decode_next_token( + self, + x: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, + ): + q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + k_cache = torch.cat([k_cache, k], dim=1) + v_cache = torch.cat([v_cache, v], dim=1) + + batch_size = q.shape[0] + q_len = q.shape[1] + kv_len = k_cache.shape[1] + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + if torch_sdpa: + attn = F.scaled_dot_product_attention(q, k, v, (~attn_mask) if attn_mask is not None else None) + else: + attn = scaled_dot_product_attention(q, k, v, attn_mask) + + attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1) + attn = F.linear(attn, self.out_w, self.out_b) + + x = x + attn + x = F.layer_norm( + x, + [self.hidden_dim], + self.norm_w1, + self.norm_b1, + self.norm_eps1, + ) + x = x + self.mlp.forward(x) + x = F.layer_norm( + x, + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + +@torch.jit.script +class T2STransformer: + def __init__(self, num_blocks: int, blocks: List[T2SBlock]): + self.num_blocks: int = num_blocks + self.blocks = blocks + + def process_prompt( + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): + k_cache: List[torch.Tensor] = [] + v_cache: List[torch.Tensor] = [] + for i in range(self.num_blocks): + x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask, torch_sdpa) + k_cache.append(k_cache_) + v_cache.append(v_cache_) + return x, k_cache, v_cache + + def decode_next_token( + self, + x: torch.Tensor, + k_cache: List[torch.Tensor], + v_cache: List[torch.Tensor], + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, + ): + for i in range(self.num_blocks): + x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token( + x, k_cache[i], v_cache[i], attn_mask, torch_sdpa + ) + return x, k_cache, v_cache + + +class Text2SemanticDecoder(nn.Module): + def __init__(self, config, norm_first=False, top_k=3): + super(Text2SemanticDecoder, self).__init__() + self.model_dim = config["model"]["hidden_dim"] + self.embedding_dim = config["model"]["embedding_dim"] + self.num_head = config["model"]["head"] + self.num_layers = config["model"]["n_layer"] + self.norm_first = norm_first + self.vocab_size = config["model"]["vocab_size"] + self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] + self.p_dropout = 
config["model"]["dropout"] + self.EOS = config["model"]["EOS"] + self.norm_first = norm_first + assert self.EOS == self.vocab_size - 1 + # should be same as num of kmeans bin + # assert self.EOS == 1024 + self.bert_proj = nn.Linear(1024, self.embedding_dim) + self.ar_text_embedding = TokenEmbedding( + self.embedding_dim, + self.phoneme_vocab_size, + self.p_dropout, + ) + self.ar_text_position = SinePositionalEmbedding( + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, + ) + self.ar_audio_embedding = TokenEmbedding( + self.embedding_dim, + self.vocab_size, + self.p_dropout, + ) + self.ar_audio_position = SinePositionalEmbedding( + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, + ) + + self.h = TransformerEncoder( + TransformerEncoderLayer( + d_model=self.model_dim, + nhead=self.num_head, + dim_feedforward=self.model_dim * 4, + dropout=0.1, + batch_first=True, + norm_first=norm_first, + ), + num_layers=self.num_layers, + norm=LayerNorm(self.model_dim) if norm_first else None, + ) + + self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.loss_fct = nn.CrossEntropyLoss(reduction="sum") + + self.ar_accuracy_metric = MulticlassAccuracy( + self.vocab_size, + top_k=top_k, + average="micro", + multidim_average="global", + ignore_index=self.EOS, + ) + + blocks = [] + + for i in range(self.num_layers): + layer = self.h.layers[i] + t2smlp = T2SMLP( + layer.linear1.weight, + layer.linear1.bias, + layer.linear2.weight, + layer.linear2.bias, + ) + + block = T2SBlock( + self.num_head, + self.model_dim, + t2smlp, + layer.self_attn.in_proj_weight, + layer.self_attn.in_proj_bias, + layer.self_attn.out_proj.weight, + layer.self_attn.out_proj.bias, + layer.norm1.weight, + layer.norm1.bias, + layer.norm1.eps, + layer.norm2.weight, + layer.norm2.bias, + layer.norm2.eps, + ) + + blocks.append(block) + + self.t2s_transformer = T2STransformer(self.num_layers, blocks) + + def make_input_data(self, x, x_lens, y, y_lens, bert_feature): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + x_mask = make_pad_mask(x_lens) + + y_mask = make_pad_mask(y_lens) + y_mask_int = y_mask.type(torch.int64) + codes = y.type(torch.int64) * (1 - y_mask_int) + + # Training + # AR Decoder + y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) + x_len = x_lens.max() + y_len = y_lens.max() + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + + xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) + + ar_xy_padding_mask = xy_padding_mask + + x_attn_mask = F.pad( + torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), + (0, y_len), + value=True, + ) + # x_attn_mask[:, x_len]=False + y_attn_mask = F.pad( + torch.triu( + torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), + diagonal=1, + ), + (x_len, 0), + value=False, + ) + + xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) + bsz, src_len = x.shape[0], x_len + y_len + _xy_padding_mask = ( + ar_xy_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, self.num_head, -1, -1) + .reshape(bsz * self.num_head, 1, src_len) + ) + xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) + new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) + new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) + xy_attn_mask = new_attn_mask + # x 和完整的 y 一次性输入模型 + xy_pos = torch.concat([x, y_pos], dim=1) + + return xy_pos, xy_attn_mask, targets + + def forward(self, x, x_lens, y, y_lens, bert_feature): + """ 
+ x: phoneme_ids + y: semantic_ids + """ + + reject_y, reject_y_lens = make_reject_y(y, y_lens) + + xy_pos, xy_attn_mask, targets = self.make_input_data(x, x_lens, y, y_lens, bert_feature) + + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + x_len = x_lens.max() + logits = self.ar_predict_layer(xy_dec[:, x_len:]) + + ###### DPO ############# + reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data( + x, x_lens, reject_y, reject_y_lens, bert_feature + ) + + reject_xy_dec, _ = self.h( + (reject_xy_pos, None), + mask=reject_xy_attn_mask, + ) + x_len = x_lens.max() + reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:]) + + # loss + # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum + + loss_1 = F.cross_entropy(logits.permute(0, 2, 1), targets, reduction="sum") + acc = self.ar_accuracy_metric(logits.permute(0, 2, 1).detach(), targets).item() + + A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets) + loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True) + + loss = loss_1 + loss_2 + + return loss, acc + + def forward_old(self, x, x_lens, y, y_lens, bert_feature): + """ + x: phoneme_ids + y: semantic_ids + """ + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + x_mask = make_pad_mask(x_lens) + + y_mask = make_pad_mask(y_lens) + y_mask_int = y_mask.type(torch.int64) + codes = y.type(torch.int64) * (1 - y_mask_int) + + # Training + # AR Decoder + y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) + x_len = x_lens.max() + y_len = y_lens.max() + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + + xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) + ar_xy_padding_mask = xy_padding_mask + + x_attn_mask = F.pad( + torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), + (0, y_len), + value=True, + ) + y_attn_mask = F.pad( + torch.triu( + torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), + diagonal=1, + ), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) + bsz, src_len = x.shape[0], x_len + y_len + _xy_padding_mask = ( + ar_xy_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, self.num_head, -1, -1) + .reshape(bsz * self.num_head, 1, src_len) + ) + xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) + new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) + new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) + xy_attn_mask = new_attn_mask + # x 和完整的 y 一次性输入模型 + xy_pos = torch.concat([x, y_pos], dim=1) + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1) + # loss + # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum + loss = F.cross_entropy(logits, targets, reduction="sum") + acc = self.ar_accuracy_metric(logits.detach(), targets).item() + return loss, acc + + # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 + def infer( + self, + x, + x_lens, + prompts, + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, + ): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + prefix_len = y.shape[1] + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) + stop = False + for _ in tqdm(range(1500)): + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + # x 
和逐渐增长的 y 一起输入给模型 + xy_pos = torch.concat([x, y_pos], dim=1) + y_len = y.shape[1] + x_attn_mask_pad = F.pad( + x_attn_mask, + (0, y_len), + value=True, + ) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(y.device) + + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature) + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("use early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS) + stop = True + if stop: + if prompts.shape[1] == y.shape[1]: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + # 本次生成的 semantic_ids 和之前的 y 构成新的 y + # print(samples.shape)#[1,1]#第一个1是bs + # import os + # os._exit(2333) + y = torch.concat([y, samples], dim=1) + return y + + def pad_y_eos(self, y, y_mask_int, eos_id): + targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1) + # 错位 + return targets[:, :-1], targets[:, 1:] + + def infer_panel_batch_infer( + self, + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs, + ): + if prompts is None: + print("Warning: Prompt free is not supported batch_infer! 
switch to naive_infer") + return self.infer_panel_naive_batched( + x, + x_lens, + prompts, + bert_feature, + top_k=top_k, + top_p=top_p, + early_stop_num=early_stop_num, + temperature=temperature, + **kwargs, + ) + + max_len = kwargs.get("max_len", x_lens.max()) + x_list = [] + for x_item, bert_item in zip(x, bert_feature): + # max_len = max(max_len, x_item.shape[0], bert_item.shape[1]) + x_item = self.ar_text_embedding(x_item.unsqueeze(0)) + x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0)) + x_item = self.ar_text_position(x_item).squeeze(0) + # x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0] early_stop_num) or idx == 1499: + print("use early stop num:", early_stop_num) + stop = True + for i, batch_index in enumerate(batch_idx_map): + batch_index = batch_idx_map[i] + idx_list[batch_index] = idx + y_list[batch_index] = y[i, :-1] + + if None not in idx_list: + stop = True + + if stop: + if y.shape[1] == 0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + + ####################### update next step ################################### + y_emb = self.ar_audio_embedding(y[:, -1:]) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + if None in idx_list: + for i in range(x.shape[0]): + if idx_list[i] is None: + idx_list[i] = 1500 - 1 ###如果没有生成到EOS,就用最大长度代替 + + if ref_free: + return y_list, [0] * x.shape[0] + # print(idx_list) + return y_list, idx_list + + def infer_panel_naive_batched( + self, + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs, + ): + y_list = [] + idx_list = [] + for i in range(len(x)): + y, idx = self.infer_panel_naive( + x[i].unsqueeze(0), + x_lens[i], + prompts[i].unsqueeze(0) if prompts is not None else None, + bert_feature[i].unsqueeze(0), + top_k, + top_p, + early_stop_num, + temperature, + repetition_penalty, + **kwargs, + ) + y_list.append(y[0]) + idx_list.append(idx) + + return y_list, idx_list + + def infer_panel_naive( + self, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs, + ): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) + stop = False + # print(1111111,self.num_layers) + + k_cache = None + v_cache = None + ################### first step ########################## + if y is not None: + y_emb = self.ar_audio_embedding(y) + y_len = y_emb.shape[1] + prefix_len = y.shape[1] + y_pos = self.ar_audio_position(y_emb) + xy_pos = torch.concat([x, y_pos], dim=1) + ref_free = False + else: + y_emb = None + y_len = 0 + prefix_len = 0 + y_pos = None + xy_pos = x + y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device) + ref_free = True + + bsz = x.shape[0] + src_len = x_len + y_len + x_attn_mask_pad = F.pad( + 
x_attn_mask, + (0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) + value=True, + ) + y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y) + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) + + for idx in tqdm(range(1500)): + if xy_attn_mask is not None: + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) + else: + xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) + + logits = self.ar_predict_layer(xy_dec[:, -1]) + + if idx == 0: + xy_attn_mask = None + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) + logits = logits[:, :-1] + + samples = sample( + logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature + )[0] + + y = torch.concat([y, samples], dim=1) + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("use early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + if y.shape[1] == 0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + + ####################### update next step ################################### + y_emb = self.ar_audio_embedding(y[:, -1:]) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + if ref_free: + return y[:, :-1], 0 + return y[:, :-1], idx + + def infer_panel( + self, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + repetition_penalty: float = 1.35, + **kwargs, + ): + return self.infer_panel_naive( + x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs + ) diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py b/GPT_SoVITS/AR/models/t2s_model_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..4f7b50a3adc0217d8016bfc41507e71eb8dc00b8 --- /dev/null +++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py @@ -0,0 +1,394 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py +# reference: https://github.com/lifeiteng/vall-e +import torch +from torch import nn +from torch.nn import functional as F +from torchmetrics.classification import MulticlassAccuracy + +from AR.modules.embedding_onnx import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer_onnx import LayerNorm, TransformerEncoder, TransformerEncoderLayer + +default_config = { + "embedding_dim": 512, + "hidden_dim": 512, + "num_head": 8, + "num_layers": 12, + "num_codebook": 8, + "p_dropout": 0.0, + "vocab_size": 1024 + 1, + "phoneme_vocab_size": 512, + "EOS": 1024, +} + +inf_tensor_value = torch.FloatTensor([-float("Inf")]).float() + + +def logits_to_probs( + logits, + previous_tokens=None, + temperature: float = 1.0, + top_k=None, + top_p=None, + repetition_penalty: float = 1.0, +): + previous_tokens = previous_tokens.squeeze() + if previous_tokens is not 
None and repetition_penalty != 1.0: + previous_tokens = previous_tokens.long() + score = torch.gather(logits, dim=0, index=previous_tokens) + score = torch.where( + score < 0, + score * repetition_penalty, + score / repetition_penalty, + ) + logits.scatter_(dim=0, index=previous_tokens, src=score) + + if top_p is not None and top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum( + torch.nn.functional.softmax( + sorted_logits, + dim=-1, + ), + dim=-1, + ) + sorted_indices_to_remove = cum_probs > top_p + sorted_indices_to_remove[0] = False # keep at least one option + indices_to_remove = sorted_indices_to_remove.scatter( + dim=0, + index=sorted_indices, + src=sorted_indices_to_remove, + ) + logits = logits.masked_fill(indices_to_remove, -float("Inf")) + + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, top_k) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, inf_tensor_value, logits) + + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.randn_like(probs_sort) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def sample( + logits, + previous_tokens, + **sampling_kwargs, +): + probs = logits_to_probs( + logits=logits, + previous_tokens=previous_tokens, + **sampling_kwargs, + ) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +class OnnxEncoder(nn.Module): + def __init__(self, ar_text_embedding, bert_proj, ar_text_position): + super().__init__() + self.ar_text_embedding = ar_text_embedding + self.bert_proj = bert_proj + self.ar_text_position = ar_text_position + + def forward(self, x, bert_feature): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + return self.ar_text_position(x) + + +class T2SFirstStageDecoder(nn.Module): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): + super().__init__() + self.ar_audio_embedding = ar_audio_embedding + self.ar_audio_position = ar_audio_position + self.h = h + self.ar_predict_layer = ar_predict_layer + self.loss_fct = loss_fct + self.ar_accuracy_metric = ar_accuracy_metric + self.top_k = top_k + self.early_stop_num = early_stop_num + self.num_layers = num_layers + + def forward(self, x, prompt): + y = prompt + x_example = x[:, :, 0] * 0.0 + # N, 1, 512 + cache = { + "all_stage": self.num_layers, + "k": None, + "v": None, + "y_emb": None, + "first_infer": 1, + "stage": 0, + } + + y_emb = self.ar_audio_embedding(y) + + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + + xy_pos = torch.concat([x, y_pos], dim=1) + + y_example = y_pos[:, :, 0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example).bool() + y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64) + y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum( + torch.ones_like( + y_example.transpose(0, 1), + dtype=torch.int64, + ), + dim=0, + ) + y_attn_mask = y_attn_mask > 0 + + x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool() + y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool() + x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1) + y_attn_mask = 
torch.cat([y_x_pad, y_attn_mask], dim=1) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + cache["k"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) + cache["v"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) + + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + + y = torch.concat([y, samples], dim=1) + + return y, cache["k"], cache["v"], cache["y_emb"], x_example + + +class T2SStageDecoder(nn.Module): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): + super().__init__() + self.ar_audio_embedding = ar_audio_embedding + self.ar_audio_position = ar_audio_position + self.h = h + self.ar_predict_layer = ar_predict_layer + self.loss_fct = loss_fct + self.ar_accuracy_metric = ar_accuracy_metric + self.top_k = top_k + self.early_stop_num = early_stop_num + self.num_layers = num_layers + + def forward(self, y, k, v, y_emb, x_example): + cache = { + "all_stage": self.num_layers, + "k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)), + "v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)), + "y_emb": y_emb, + "first_infer": 0, + "stage": 0, + } + + y_emb = torch.cat( + [ + cache["y_emb"], + self.ar_audio_embedding(y[:, -1:]), + ], + 1, + ) + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + + xy_pos = y_pos[:, -1:] + + y_example = y_pos[:, :, 0] * 0.0 + + xy_attn_mask = torch.cat([x_example, y_example], dim=1) + xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool) + + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + + y = torch.concat([y, samples], dim=1) + + return y, cache["k"], cache["v"], cache["y_emb"], logits, samples + + +class Text2SemanticDecoder(nn.Module): + def __init__(self, config, norm_first=False, top_k=3): + super(Text2SemanticDecoder, self).__init__() + self.model_dim = config["model"]["hidden_dim"] + self.embedding_dim = config["model"]["embedding_dim"] + self.num_head = config["model"]["head"] + self.num_layers = config["model"]["n_layer"] + self.norm_first = norm_first + self.vocab_size = config["model"]["vocab_size"] + self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] + self.p_dropout = float(config["model"]["dropout"]) + self.EOS = config["model"]["EOS"] + self.norm_first = norm_first + assert self.EOS == self.vocab_size - 1 + self.bert_proj = nn.Linear(1024, self.embedding_dim) + self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout) + self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout) + self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.h = TransformerEncoder( + TransformerEncoderLayer( + d_model=self.model_dim, + nhead=self.num_head, + dim_feedforward=self.model_dim * 4, + dropout=0.1, + batch_first=True, + norm_first=norm_first, + ), + 
num_layers=self.num_layers, + norm=LayerNorm(self.model_dim) if norm_first else None, + ) + self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.loss_fct = nn.CrossEntropyLoss(reduction="sum") + self.ar_accuracy_metric = MulticlassAccuracy( + self.vocab_size, + top_k=top_k, + average="micro", + multidim_average="global", + ignore_index=self.EOS, + ) + self.top_k = torch.LongTensor([1]) + self.early_stop_num = torch.LongTensor([-1]) + + def init_onnx(self): + self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position) + self.first_stage_decoder = T2SFirstStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) + self.stage_decoder = T2SStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) + + def forward(self, x, prompts, bert_feature): + early_stop_num = self.early_stop_num + prefix_len = prompts.shape[1] + + x = self.onnx_encoder(x, bert_feature) + y, k, v, y_emb, stage, x_example = self.first_stage_decoder(x, prompts) + + stop = False + for idx in range(1, 1500): + enco = self.stage_decoder(y, k, v, y_emb, stage, x_example) + y, k, v, y_emb, stage, logits, samples = enco + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + break + y[0, -1] = 0 + return y, idx + + def infer(self, x, prompts, bert_feature): + top_k = self.top_k + early_stop_num = self.early_stop_num + + x = self.onnx_encoder(x, bert_feature) + + y = prompts + prefix_len = y.shape[1] + x_len = x.shape[1] + x_example = x[:, :, 0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example) + x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool) + + stop = False + cache = { + "all_stage": self.num_layers, + "k": [None] * self.num_layers, + "v": [None] * self.num_layers, + "y_emb": None, + "first_infer": 1, + "stage": 0, + } + for idx in range(1500): + if cache["first_infer"] == 1: + y_emb = self.ar_audio_embedding(y) + else: + y_emb = torch.cat([cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1) + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + if cache["first_infer"] == 1: + xy_pos = torch.concat([x, y_pos], dim=1) + else: + xy_pos = y_pos[:, -1:] + y_len = y_pos.shape[1] + if cache["first_infer"] == 1: + x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + else: + xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool) + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + if prompts.shape[1] == y.shape[1]: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + break + y = torch.concat([y, samples], dim=1) + cache["first_infer"] = 
0 + return y, idx diff --git a/GPT_SoVITS/AR/models/utils.py b/GPT_SoVITS/AR/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cc4f24d89ff7c02a2e35ce8c083f4b028dee85a0 --- /dev/null +++ b/GPT_SoVITS/AR/models/utils.py @@ -0,0 +1,282 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py +# reference: https://github.com/lifeiteng/vall-e +from typing import Tuple + +import torch +import torch.nn.functional as F + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """ + Args: + lengths: + A 1-D tensor containing sentence lengths. + max_len: + The length of masks. + Returns: + Return a 2-D bool tensor, where masked positions + are filled with `True` and non-masked positions are + filled with `False`. + + #>>> lengths = torch.tensor([1, 3, 2, 5]) + #>>> make_pad_mask(lengths) + tensor([[False, True, True, True, True], + [False, False, False, True, True], + [False, False, True, True, True], + [False, False, False, False, False]]) + """ + assert lengths.ndim == 1, lengths.ndim + max_len = max(max_len, lengths.max()) + n = lengths.size(0) + seq_range = torch.arange(0, max_len, device=lengths.device) + expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len) + + return expaned_lengths >= lengths.unsqueeze(-1) + + +def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """ + Args: + lengths: + A 1-D tensor containing sentence lengths. + max_len: + The length of masks. + Returns: + Return a 2-D bool tensor, where masked positions + are filled with `True` and non-masked positions are + filled with `False`. + + #>>> lengths = torch.tensor([1, 3, 2, 5]) + #>>> make_pad_mask(lengths) + tensor( + [ + [True, True, False], + [True, False, False], + [True, True, False], + ... + ] + ) + """ + assert lengths.ndim == 1, lengths.ndim + max_len = max(max_len, lengths.max()) + n = lengths.size(0) + seq_range = torch.arange(0, max_len, device=lengths.device) + expaned_lengths = seq_range.unsqueeze(0).repeat(n, 1) + expaned_lengths -= (max_len - lengths).unsqueeze(-1) + + return expaned_lengths < 0 + + +# https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py +def top_k_top_p_filtering( + logits, + top_k=0, + top_p=1.0, + filter_value=-float("Inf"), + min_tokens_to_keep=1, +): + """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + +def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): + # temperature: (`optional`) float + # The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. + # top_k: (`optional`) int + # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + # top_p: (`optional`) float + # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. 
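The comments above summarize the three sampling knobs (temperature, top_k, top_p). As a minimal, self-contained sketch of that pipeline using only plain PyTorch ops — the helper name toy_top_k_top_p, the toy logits, and the thresholds are illustrative, not values taken from this repo:

import torch
import torch.nn.functional as F

def toy_top_k_top_p(logits: torch.Tensor, top_k: int = 2, top_p: float = 0.9, temperature: float = 1.0) -> torch.Tensor:
    # Temperature scaling: higher temperature flattens the distribution.
    logits = logits / max(temperature, 1e-5)
    # Top-k: mask everything below the k-th largest logit.
    kth = torch.topk(logits, top_k)[0][..., -1, None]
    logits = logits.masked_fill(logits < kth, float("-inf"))
    # Top-p (nucleus): mask tokens once cumulative probability exceeds top_p,
    # always keeping the single most likely token.
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    remove = cum_probs > top_p
    remove[..., 0] = False
    logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), float("-inf"))
    return F.softmax(logits, dim=-1)

probs = toy_top_k_top_p(torch.tensor([[2.0, 1.0, 0.5, -1.0]]))
print(probs)                                    # probability mass concentrated on the first token
print(torch.multinomial(probs, num_samples=1))  # tensor([[0]])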
+ + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + logits = logits / temperature + # Top-p/top-k filtering + logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) + # Sample + token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) + return token + + +from typing import Optional + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def logits_to_probs( + logits, + previous_tokens: Optional[torch.Tensor] = None, + temperature: float = 1.0, + top_k: Optional[int] = None, + top_p: Optional[int] = None, + repetition_penalty: float = 1.0, +): + # if previous_tokens is not None: + # previous_tokens = previous_tokens.squeeze() + # print(logits.shape,previous_tokens.shape) + # pdb.set_trace() + if previous_tokens is not None and repetition_penalty != 1.0: + previous_tokens = previous_tokens.long() + score = torch.gather(logits, dim=1, index=previous_tokens) + score = torch.where( + score < 0, + score * repetition_penalty, + score / repetition_penalty, + ) + logits.scatter_(dim=1, index=previous_tokens, src=score) + + if top_p is not None and top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) + sorted_indices_to_remove = cum_probs > top_p + sorted_indices_to_remove[:, 0] = False # keep at least one option + indices_to_remove = sorted_indices_to_remove.scatter( + dim=1, + index=sorted_indices, + src=sorted_indices_to_remove, + ) + logits = logits.masked_fill(indices_to_remove, -float("Inf")) + + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v[:, -1].unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def sample( + logits, + previous_tokens: Optional[torch.Tensor] = None, + **sampling_kwargs, +) -> Tuple[torch.Tensor, torch.Tensor]: + probs = logits_to_probs(logits=logits, previous_tokens=previous_tokens, **sampling_kwargs) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +def dpo_loss( + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + beta: float, + reference_free: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + if reference_free: + ref_logratios = 0 + + logits = pi_logratios - ref_logratios + + losses = -F.logsigmoid(beta * logits) + chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach() + rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach() + + return losses.mean(), chosen_rewards, rejected_rewards + + +def get_batch_logps( + logits_target: torch.FloatTensor, + logits_reject: torch.FloatTensor, + labels_target: torch.LongTensor, + labels_reject: torch.LongTensor, + average_log_prob: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + # dummy token; we'll ignore the losses on these tokens later + + per_token_logps_target 
= torch.gather( + logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2) + ).squeeze(2) + per_token_logps_reject = torch.gather( + logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2) + ).squeeze(2) + + return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1) + + +def make_reject_y(y_o, y_lens): + def repeat_P(y): + range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] + new_y = torch.cat([pre, range_text, range_text, shf]) + return new_y + + def lost_P(y): + range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] + new_y = torch.cat([pre, shf]) + return new_y + + bs = len(y_lens) + reject_y = [] + reject_y_lens = [] + for b in range(bs): + process_item_idx = torch.randint(0, 1, size=(1,))[0] + if process_item_idx == 0: + new_y = repeat_P(y_o[b]) + reject_y.append(new_y) + reject_y_lens.append(len(new_y)) + elif process_item_idx == 1: + new_y = lost_P(y_o[b]) + reject_y.append(new_y) + reject_y_lens.append(len(new_y)) + max_length = max(reject_y_lens) + for b in range(bs): + pad_length = max_length - reject_y_lens[b] + reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0) + + reject_y = torch.stack(reject_y, dim=0) + reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device) + + return reject_y, reject_y_lens diff --git a/GPT_SoVITS/AR/modules/__init__.py b/GPT_SoVITS/AR/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..936f9c3fa5c18357eac4d57294167f23d7ce5700 --- /dev/null +++ b/GPT_SoVITS/AR/modules/activation.py @@ -0,0 +1,413 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py +from typing import Optional, Tuple + +import torch +from torch import Tensor +from torch.nn import Linear, Module +from torch.nn import functional as F +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ +from torch.nn.modules.linear import NonDynamicallyQuantizableLinear +from torch.nn.parameter import Parameter + +from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched + +F.multi_head_attention_forward = multi_head_attention_forward_patched + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information + from different representation subspaces as described in the paper: + `Attention Is All You Need `_. + + Multi-Head Attention is defined as: + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + + where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + + ``forward()`` will use a special optimized implementation if all of the following + conditions are met: + + - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This + restriction will be loosened in the future.) 
+ - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - dropout is 0 + - ``add_bias_kv`` is ``False`` + - ``add_zero_attn`` is ``False`` + - ``batch_first`` is ``True`` and the input is batched + - ``kdim`` and ``vdim`` are equal to ``embed_dim`` + - at most one of ``key_padding_mask`` or ``attn_mask`` is passed + - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` + nor ``attn_mask`` is passed + + If the optimized implementation is in use, a + `NestedTensor `_ can be passed for + ``query``/``key``/``value`` to represent padding more efficiently than using a + padding mask. In this case, a `NestedTensor `_ + will be returned, and an additional speedup proportional to the fraction of the input + that is padding can be expected. + + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. + add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). 
+ + Examples:: + + >>> # xdoctest: +SKIP + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + + """ + + __constants__ = ["batch_first"] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + linear1_cls=Linear, + linear2_cls=Linear, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + if linear1_cls == Linear: + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter( + torch.empty((embed_dim, embed_dim), **factory_kwargs), + ) + self.k_proj_weight = Parameter( + torch.empty((embed_dim, self.kdim), **factory_kwargs), + ) + self.v_proj_weight = Parameter( + torch.empty((embed_dim, self.vdim), **factory_kwargs), + ) + self.register_parameter("in_proj_weight", None) + else: + self.in_proj_weight = Parameter( + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs), + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) + else: + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear( + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, + ) + + self._reset_parameters() + else: + if not self._qkv_same_embed_dim: + raise NotImplementedError + else: + self.in_proj_linear = linear1_cls( + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, + ) + self.in_proj_weight = self.in_proj_linear.weight + + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = self.in_proj_linear.bias + else: + self.register_parameter("in_proj_bias", None) + + self.out_proj = linear2_cls( + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, + ) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + self.add_zero_attn = add_zero_attn + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention 
checkpoints generated by v1.1.0 + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + cache=None, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` + or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, + :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. + Queries are compared against key-value pairs to produce the output. + See "Attention Is All You Need" for more details. + key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` + or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, + :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. + See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when + ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source + sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. + See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. + Binary and byte masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an + effect when ``need_weights=True``. Default: ``True`` (i.e. 
average weights across heads) + + Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, + :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, + where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the + embedding dimension ``embed_dim``. + - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, + returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. + + .. note:: + `batch_first` argument is ignored for unbatched inputs. + """ + is_batched = query.dim() == 3 + if key_padding_mask is not None: + _kpm_dtype = key_padding_mask.dtype + if _kpm_dtype != torch.bool and not torch.is_floating_point( + key_padding_mask, + ): + raise AssertionError("only bool and floating types of key_padding_mask are supported") + why_not_fast_path = "" + if not is_batched: + why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" + elif query is not key or key is not value: + # When lifting this restriction, don't forget to either + # enforce that the dtypes all match or test cases where + # they don't! + why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + ) + elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype: + # this case will fail anyway, but at least they'll get a useful error message. + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + ) + elif self.training: + why_not_fast_path = "training is enabled" + elif not self.batch_first: + why_not_fast_path = "batch_first was not True" + elif self.bias_k is not None: + why_not_fast_path = "self.bias_k was not None" + elif self.bias_v is not None: + why_not_fast_path = "self.bias_v was not None" + elif self.dropout: + why_not_fast_path = f"dropout was {self.dropout}, required zero" + elif self.add_zero_attn: + why_not_fast_path = "add_zero_attn was enabled" + elif not self._qkv_same_embed_dim: + why_not_fast_path = "_qkv_same_embed_dim was not True" + elif attn_mask is not None: + why_not_fast_path = "attn_mask was not None" + elif query.is_nested and key_padding_mask is not None: + why_not_fast_path = "key_padding_mask is not supported with NestedTensor input" + elif self.num_heads % 2 == 1: + why_not_fast_path = "num_heads is odd" + elif torch.is_autocast_enabled(): + why_not_fast_path = "autocast is enabled" + + if not why_not_fast_path: + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
+ if torch.overrides.has_torch_function(tensor_args): + why_not_fast_path = "some Tensor argument has_torch_function" + elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args]): + why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" + elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]): + why_not_fast_path = "grad is enabled and at least one of query or the input/output projection weights or biases requires_grad" + if not why_not_fast_path: + return torch._native_multi_head_attention( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + key_padding_mask if key_padding_mask is not None else attn_mask, + need_weights, + average_attn_weights, + 1 if key_padding_mask is not None else 0 if attn_mask is not None else None, + ) + + any_nested = query.is_nested or key.is_nested or value.is_nested + assert not any_nested, ( + "MultiheadAttention does not support NestedTensor outside of its fast path. " + + f"The fast path was not hit because {why_not_fast_path}" + ) + + if self.batch_first and is_batched: + # make sure that the transpose op does not affect the "is" property + if key is value: + if query is key: + query = key = value = query.transpose(1, 0) + else: + query, key = [x.transpose(1, 0) for x in (query, key)] + value = key + else: + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + average_attn_weights=average_attn_weights, + cache=cache, + ) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + cache=cache, + ) + if self.batch_first and is_batched: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..c14ce40c9648831a172861951b2dfdc6d5cab0c6 --- /dev/null +++ b/GPT_SoVITS/AR/modules/activation_onnx.py @@ -0,0 +1,188 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py +from typing import Optional, Tuple + +import torch +from torch import Tensor +from torch.nn import Linear, Module +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ +from torch.nn.modules.linear import NonDynamicallyQuantizableLinear +from torch.nn.parameter import Parameter + +from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched + + +class 
MultiheadAttention(Module): + __constants__ = ["batch_first"] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + linear1_cls=Linear, + linear2_cls=Linear, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + if linear1_cls == Linear: + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter( + torch.empty( + (embed_dim, embed_dim), + **factory_kwargs, + ) + ) + self.k_proj_weight = Parameter( + torch.empty( + (embed_dim, self.kdim), + **factory_kwargs, + ) + ) + self.v_proj_weight = Parameter( + torch.empty( + (embed_dim, self.vdim), + **factory_kwargs, + ) + ) + self.register_parameter("in_proj_weight", None) + else: + self.in_proj_weight = Parameter( + torch.empty( + (3 * embed_dim, embed_dim), + **factory_kwargs, + ) + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = Parameter( + torch.empty(3 * embed_dim, **factory_kwargs), + ) + else: + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + self._reset_parameters() + else: + if not self._qkv_same_embed_dim: + raise NotImplementedError + else: + self.in_proj_linear = linear1_cls( + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, + ) + self.in_proj_weight = self.in_proj_linear.weight + + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = self.in_proj_linear.bias + else: + self.register_parameter("in_proj_bias", None) + + self.out_proj = linear2_cls( + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, + ) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + self.add_zero_attn = add_zero_attn + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super(MultiheadAttention, 
self).__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + cache=None, + ) -> Tuple[Tensor, Optional[Tensor]]: + any_nested = query.is_nested or key.is_nested or value.is_nested + query = key = value = query.transpose(1, 0) + attn_output = multi_head_attention_forward_patched( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + cache=cache, + ) + return attn_output.transpose(1, 0) diff --git a/GPT_SoVITS/AR/modules/embedding.py b/GPT_SoVITS/AR/modules/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..39da560386b954f9b1626cf4a47e3b4b4d4195d1 --- /dev/null +++ b/GPT_SoVITS/AR/modules/embedding.py @@ -0,0 +1,78 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py +import math + +import torch +from torch import nn + + +class TokenEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + vocab_size: int, + dropout: float = 0.0, + ): + super().__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self.dropout = torch.nn.Dropout(p=dropout) + self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) + + @property + def weight(self) -> torch.Tensor: + return self.word_embeddings.weight + + def embedding(self, index: int) -> torch.Tensor: + return self.word_embeddings.weight[index : index + 1] + + def forward(self, x: torch.Tensor): + x = self.word_embeddings(x) + x = self.dropout(x) + return x + + +class SinePositionalEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + dropout: float = 0.0, + scale: bool = False, + alpha: bool = False, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 + self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) + self.dropout = torch.nn.Dropout(p=dropout) + + self.reverse = False + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, 4000)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.embedding_dim) + if self.reverse: + position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype).detach() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + self.extend_pe(x) + output = x.unsqueeze(-1) if x.ndim == 2 else x + output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] + return self.dropout(output) diff --git a/GPT_SoVITS/AR/modules/embedding_onnx.py 
b/GPT_SoVITS/AR/modules/embedding_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..c870013f0e4c1af463790ff3627fab3550fe4c93 --- /dev/null +++ b/GPT_SoVITS/AR/modules/embedding_onnx.py @@ -0,0 +1,63 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py +import math + +import torch +from torch import nn + + +class TokenEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + vocab_size: int, + dropout: float = 0.0, + ): + super().__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self.dropout = torch.nn.Dropout(p=dropout) + self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) + + @property + def weight(self) -> torch.Tensor: + return self.word_embeddings.weight + + def embedding(self, index: int) -> torch.Tensor: + return self.word_embeddings.weight[index : index + 1] + + def forward(self, x: torch.Tensor): + x = self.word_embeddings(x) + x = self.dropout(x) + return x + + +class SinePositionalEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + dropout: float = 0.0, + scale: bool = False, + alpha: bool = False, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 + self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) + self.dropout = torch.nn.Dropout(p=dropout) + self.reverse = False + self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) + + def extend_pe(self, x): + position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) + scpe = (position * self.div_term).unsqueeze(0) + pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) + pe = pe.contiguous().view(1, -1, self.embedding_dim) + return pe + + def forward(self, x: torch.Tensor) -> torch.Tensor: + pe = self.extend_pe(x) + output = x.unsqueeze(-1) if x.ndim == 2 else x + output = output * self.x_scale + self.alpha * pe + return self.dropout(output) diff --git a/GPT_SoVITS/AR/modules/lr_schedulers.py b/GPT_SoVITS/AR/modules/lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..707a911f9b7fbcf74d009143aa0c3e21605c0704 --- /dev/null +++ b/GPT_SoVITS/AR/modules/lr_schedulers.py @@ -0,0 +1,85 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py +# reference: https://github.com/lifeiteng/vall-e +import math + +import torch +from matplotlib import pyplot as plt +from torch import nn +from torch.optim import Adam + + +class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): + """ + Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
+ """ + + def __init__( + self, + optimizer, + init_lr, + peak_lr, + end_lr, + warmup_steps=10000, + total_steps=400000, + current_step=0, + ): + self.init_lr = init_lr + self.peak_lr = peak_lr + self.end_lr = end_lr + self.optimizer = optimizer + self._warmup_rate = (peak_lr - init_lr) / warmup_steps + self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) + self._current_step = current_step + self.lr = init_lr + self.warmup_steps = warmup_steps + self.total_steps = total_steps + self._last_lr = [self.lr] + + def set_lr(self, lr): + self._last_lr = [g["lr"] for g in self.optimizer.param_groups] + for g in self.optimizer.param_groups: + # g['lr'] = lr + g["lr"] = self.end_lr ###锁定用线性 + + def step(self): + if self._current_step < self.warmup_steps: + lr = self.init_lr + self._warmup_rate * self._current_step + + elif self._current_step > self.total_steps: + lr = self.end_lr + + else: + decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) + if decay_ratio < 0.0 or decay_ratio > 1.0: + raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") + coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) + lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) + + self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! + self.set_lr(lr) + self.lr = lr + self._current_step += 1 + return self.lr + + +if __name__ == "__main__": + m = nn.Linear(10, 10) + opt = Adam(m.parameters(), lr=1e-4) + s = WarmupCosineLRSchedule( + opt, + 1e-6, + 2e-4, + 1e-6, + warmup_steps=2000, + total_steps=20000, + current_step=0, + ) + lrs = [] + for i in range(25000): + s.step() + lrs.append(s.lr) + print(s.lr) + + plt.plot(lrs) + plt.plot(range(0, 25000), lrs) + plt.show() diff --git a/GPT_SoVITS/AR/modules/optim.py b/GPT_SoVITS/AR/modules/optim.py new file mode 100644 index 0000000000000000000000000000000000000000..aeebbee354fba8279042801895715a3934df2695 --- /dev/null +++ b/GPT_SoVITS/AR/modules/optim.py @@ -0,0 +1,593 @@ +# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import logging +from collections import defaultdict +from typing import List, Tuple + +import torch +from torch import Tensor +from torch.optim import Optimizer + + +class BatchedOptimizer(Optimizer): + """ + This class adds to class Optimizer the capability to optimize parameters in batches: + it will stack the parameters and their grads for you so the optimizer can work + on tensors with an extra leading dimension. This is intended for speed with GPUs, + as it reduces the number of kernels launched in the optimizer. 
+ + Args: + params: + """ + + def __init__(self, params, defaults): + super(BatchedOptimizer, self).__init__(params, defaults) + + @contextlib.contextmanager + def batched_params(self, param_group, group_params_names): + """ + This function returns (technically, yields) a list of + of tuples (p, state), where + p is a `fake` parameter that is stacked (over axis 0) from real parameters + that share the same shape, and its gradient is also stacked; + `state` is the state corresponding to this batch of parameters + (it will be physically located in the "state" for one of the real + parameters, the last one that has any particular shape and dtype). + + This function is decorated as a context manager so that it can + write parameters back to their "real" locations. + + The idea is, instead of doing: + + for p in group["params"]: + state = self.state[p] + ... + + you can do: + + with self.batched_params(group["params"]) as batches: + for p, state, p_names in batches: + ... + + + Args: + group: a parameter group, which is a list of parameters; should be + one of self.param_groups. + group_params_names: name for each parameter in group, + which is List[str]. + """ + batches = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter + batches_names = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of str + + assert len(param_group) == len(group_params_names) + for p, named_p in zip(param_group, group_params_names): + key = (str(p.dtype), *p.shape) + batches[key].append(p) + batches_names[key].append(named_p) + + batches_names_keys = list(batches_names.keys()) + sorted_idx = sorted(range(len(batches_names)), key=lambda i: batches_names_keys[i]) + batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx] + batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] + + stacked_params_dict = dict() + + # turn batches into a list, in deterministic order. + # tuples will contain tuples of (stacked_param, state, stacked_params_names), + # one for each batch in `batches`. + tuples = [] + + for batch, batch_names in zip(batches, batches_names): + p = batch[0] + # we arbitrarily store the state in the + # state corresponding to the 1st parameter in the + # group. class Optimizer will take care of saving/loading state. + state = self.state[p] + p_stacked = torch.stack(batch) + grad = torch.stack([torch.zeros_like(p) if p.grad is None else p.grad for p in batch]) + p_stacked.grad = grad + stacked_params_dict[key] = p_stacked + tuples.append((p_stacked, state, batch_names)) + + yield tuples # <-- calling code will do the actual optimization here! + + for (stacked_params, _state, _names), batch in zip(tuples, batches): + for i, p in enumerate(batch): # batch is list of Parameter + p.copy_(stacked_params[i]) + + +class ScaledAdam(BatchedOptimizer): + """ + Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update + proportional to the norm of that parameter; and also learn the scale of the parameter, + in log space, subject to upper and lower limits (as if we had factored each parameter as + param = underlying_param * log_scale.exp()) + + + Args: + params: The parameters or param_groups to optimize (like other Optimizer subclasses) + lr: The learning rate. We will typically use a learning rate schedule that starts + at 0.03 and decreases over time, i.e. much higher than other common + optimizers. + clipping_scale: (e.g. 
2.0) + A scale for gradient-clipping: if specified, the normalized gradients + over the whole model will be clipped to have 2-norm equal to + `clipping_scale` times the median 2-norm over the most recent period + of `clipping_update_period` minibatches. By "normalized gradients", + we mean after multiplying by the rms parameter value for this tensor + [for non-scalars]; this is appropriate because our update is scaled + by this quantity. + betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad. + Must satisfy 0 < beta <= beta2 < 1. + scalar_lr_scale: A scaling factor on the learning rate, that we use to update the + scale of each parameter tensor and scalar parameters of the mode.. + If each parameter were decomposed + as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale + would be a the scaling factor on the learning rate of p_scale. + eps: A general-purpose epsilon to prevent division by zero + param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be >= this value) + param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be <= this value) + scalar_max: Maximum absolute value for scalar parameters (applicable if your + model has any parameters with numel() == 1). + size_update_period: The periodicity, in steps, with which we update the size (scale) + of the parameter tensor. This is provided to save a little time + in the update. + clipping_update_period: if clipping_scale is specified, this is the period + """ + + def __init__( + self, + params, + lr=3e-02, + clipping_scale=None, + betas=(0.9, 0.98), + scalar_lr_scale=0.1, + eps=1.0e-08, + param_min_rms=1.0e-05, + param_max_rms=3.0, + scalar_max=10.0, + size_update_period=4, + clipping_update_period=100, + parameters_names=None, + show_dominant_parameters=True, + ): + assert parameters_names is not None, ( + "Please prepare parameters_names,which is a List[List[str]]. Each List[str] is for a groupand each str is for a parameter" + ) + defaults = dict( + lr=lr, + clipping_scale=clipping_scale, + betas=betas, + scalar_lr_scale=scalar_lr_scale, + eps=eps, + param_min_rms=param_min_rms, + param_max_rms=param_max_rms, + scalar_max=scalar_max, + size_update_period=size_update_period, + clipping_update_period=clipping_update_period, + ) + + super(ScaledAdam, self).__init__(params, defaults) + assert len(self.param_groups) == len(parameters_names) + self.parameters_names = parameters_names + self.show_dominant_parameters = show_dominant_parameters + + def __setstate__(self, state): + super(ScaledAdam, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + batch = True + + for group, group_params_names in zip(self.param_groups, self.parameters_names): + with self.batched_params(group["params"], group_params_names) as batches: + # batches is list of pairs (stacked_param, state). stacked_param is like + # a regular parameter, and will have a .grad, but the 1st dim corresponds to + # a stacking dim, it is not a real dim. 
+ + if len(batches[0][1]) == 0: # if len(first state) == 0: not yet initialized + clipping_scale = 1 + else: + clipping_scale = self._get_clipping_scale(group, batches) + + for p, state, _ in batches: + # Perform optimization step. + # grad is not going to be None, we handled that when creating the batches. + grad = p.grad + if grad.is_sparse: + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") + # State initialization + if len(state) == 0: + self._init_state(group, p, state) + + self._step_one_batch(group, p, state, clipping_scale) + + return loss + + def _init_state(self, group: dict, p: Tensor, state: dict): + """ + Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p + is actually the batch dimension, corresponding to batched-together + parameters of a given shape. + + + Args: + group: Dict to look up configuration values. + p: The parameter that we are initializing the state for + state: Dict from string to whatever state we are initializing + """ + size_update_period = group["size_update_period"] + + state["step"] = 0 + + kwargs = {"device": p.device, "dtype": p.dtype} + + # 'delta' implements conventional momentum. There are + # several different kinds of update going on, so rather than + # compute "exp_avg" like in Adam, we store and decay a + # parameter-change "delta", which combines all forms of + # update. this is equivalent to how it's done in Adam, + # except for the first few steps. + state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + batch_size = p.shape[0] + numel = p.numel() // batch_size + numel = p.numel() + + if numel > 1: + # "param_rms" just periodically records the scalar root-mean-square value of + # the parameter tensor. + # it has a shape like (batch_size, 1, 1, 1, 1) + param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() + state["param_rms"] = param_rms + + state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) + state["scale_grads"] = torch.zeros(size_update_period, *param_rms.shape, **kwargs) + + # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]) -> float: + """ + Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients + by this amount before applying the rest of the update. + + Args: + group: the parameter group, an item in self.param_groups + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + """ + assert len(tuples) >= 1 + clipping_scale = group["clipping_scale"] + (first_p, first_state, _) = tuples[0] + step = first_state["step"] + if clipping_scale is None or step == 0: + # no clipping. return early on step == 0 because the other + # parameters' state won't have been initialized yet. 
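+            # (once enough steps have been seen, a factor < 1.0 is returned below whenever
+            #  the current gradient norm exceeds clipping_scale times the recent median,
+            #  e.g. clipping_scale=2.0 clips to twice the median norm over the last
+            #  clipping_update_period minibatches)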
+ return 1.0 + clipping_update_period = group["clipping_update_period"] + + tot_sumsq = torch.tensor(0.0, device=first_p.device) + for p, state, param_names in tuples: + grad = p.grad + if grad.is_sparse: + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") + if p.numel() == p.shape[0]: # a batch of scalars + tot_sumsq += (grad**2).sum() # sum() to change shape [1] to [] + else: + tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() + + tot_norm = tot_sumsq.sqrt() + if "model_norms" not in first_state: + first_state["model_norms"] = torch.zeros(clipping_update_period, device=p.device) + first_state["model_norms"][step % clipping_update_period] = tot_norm + + if step % clipping_update_period == 0: + # Print some stats. + # We don't reach here if step == 0 because we would have returned + # above. + sorted_norms = first_state["model_norms"].sort()[0].to("cpu") + quartiles = [] + for n in range(0, 5): + index = min( + clipping_update_period - 1, + (clipping_update_period // 4) * n, + ) + quartiles.append(sorted_norms[index].item()) + + median = quartiles[2] + threshold = clipping_scale * median + first_state["model_norm_threshold"] = threshold + percent_clipped = ( + first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0 + ) + first_state["num_clipped"] = 0 + quartiles = " ".join(["%.3e" % x for x in quartiles]) + logging.info( + f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" + ) + + if step < clipping_update_period: + return 1.0 # We have not yet estimated a norm to clip to. + else: + try: + model_norm_threshold = first_state["model_norm_threshold"] + except KeyError: + logging.info( + "Warning: model_norm_threshold not in state: possibly you changed config when restarting, adding clipping_scale option?" + ) + return 1.0 + ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) + if ans < 1.0: + first_state["num_clipped"] += 1 + if ans < 0.1: + logging.warn(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}") + if self.show_dominant_parameters: + assert p.shape[0] == len(param_names) + self._show_gradient_dominating_parameter(tuples, tot_sumsq) + return ans + + def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor): + """ + Show information of parameter wihch dominanting tot_sumsq. + + Args: + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + tot_sumsq: sumsq of all parameters. Though it's could be calculated + from tuples, we still pass it to save some time. + """ + all_sumsq_orig = {} + for p, state, batch_param_names in tuples: + # p is a stacked batch parameters. + batch_grad = p.grad + if p.numel() == p.shape[0]: # a batch of scalars + batch_sumsq_orig = batch_grad**2 + # Dummpy values used by following `zip` statement. 
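+                # (the ones are placeholders so that scalar batches, which carry no
+                #  param_rms, still zip cleanly with the per-parameter loop below)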
+ batch_rms_orig = torch.ones(p.shape[0]) + else: + batch_rms_orig = state["param_rms"] + batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(dim=list(range(1, batch_grad.ndim))) + + for name, sumsq_orig, rms, grad in zip( + batch_param_names, + batch_sumsq_orig, + batch_rms_orig, + batch_grad, + ): + proportion_orig = sumsq_orig / tot_sumsq + all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) + + assert torch.isclose( + sum([value[0] for value in all_sumsq_orig.values()]).cpu(), + torch.tensor(1.0), + ) + sorted_by_proportion = { + k: v + for k, v in sorted( + all_sumsq_orig.items(), + key=lambda item: item[1][0], + reverse=True, + ) + } + dominant_param_name = next(iter(sorted_by_proportion)) + ( + dominant_proportion, + dominant_sumsq, + dominant_rms, + dominant_grad, + ) = sorted_by_proportion[dominant_param_name] + logging.info( + f"Parameter Dominanting tot_sumsq {dominant_param_name}" + f" with proportion {dominant_proportion:.2f}," + f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" + f"={dominant_sumsq:.3e}," + f" grad_sumsq = {(dominant_grad**2).sum():.3e}," + f" orig_rms_sq={(dominant_rms**2).item():.3e}" + ) + + def _step_one_batch(self, group: dict, p: Tensor, state: dict, clipping_scale: float): + """ + Do the step for one parameter, which is actually going to be a batch of + `real` parameters, with dim 0 as the batch dim. + Args: + group: dict to look up configuration values + p: parameter to update (actually multiple parameters stacked together + as a batch) + state: state-dict for p, to look up the optimizer state + """ + lr = group["lr"] + size_update_period = group["size_update_period"] + beta1 = group["betas"][0] + + grad = p.grad + if clipping_scale != 1.0: + grad = grad * clipping_scale + step = state["step"] + delta = state["delta"] + + delta.mul_(beta1) + batch_size = p.shape[0] + numel = p.numel() // batch_size + if numel > 1: + # Update the size/scale of p, and set param_rms + scale_grads = state["scale_grads"] + scale_grads[step % size_update_period] = (p * grad).sum(dim=list(range(1, p.ndim)), keepdim=True) + if step % size_update_period == size_update_period - 1: + param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) + param_rms.copy_((p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) + if step > 0: + # self._size_update() learns the overall scale on the + # parameter, by shrinking or expanding it. + self._size_update(group, scale_grads, p, state) + + if numel == 1: + # For parameters with 1 element we just use regular Adam. + # Updates delta. + self._step_scalar(group, p, state) + else: + self._step(group, p, state) + + state["step"] = step + 1 + + def _size_update( + self, + group: dict, + scale_grads: Tensor, + p: Tensor, + state: dict, + ) -> None: + """ + Called only where p.numel() > 1, this updates the scale of the parameter. + If we imagine: p = underlying_param * scale.exp(), and we are doing + gradient descent on underlying param and on scale, this function does the update + on `scale`. + + Args: + group: dict to look up configuration values + scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing + grads w.r.t. the scales. 
+ p: The parameter to update + state: The state-dict of p + """ + + param_rms = state["param_rms"] + beta1, beta2 = group["betas"] + size_lr = group["lr"] * group["scalar_lr_scale"] + param_min_rms = group["param_min_rms"] + param_max_rms = group["param_max_rms"] + eps = group["eps"] + step = state["step"] + batch_size = p.shape[0] + + size_update_period = scale_grads.shape[0] + # correct beta2 for the size update period: we will have + # faster decay at this level. + beta2_corr = beta2**size_update_period + + scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) + scale_exp_avg_sq.mul_(beta2_corr).add_( + (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` + alpha=1 - beta2_corr, + ) # shape is (batch_size, 1, 1, ...) + + # The 1st time we reach here is when size_step == 1. + size_step = (step + 1) // size_update_period + bias_correction2 = 1 - beta2_corr**size_step + # we don't bother with bias_correction1; this will help prevent divergence + # at the start of training. + + denom = scale_exp_avg_sq.sqrt() + eps + + scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom + + is_too_small = param_rms < param_min_rms + is_too_large = param_rms > param_max_rms + + # when the param gets too small, just don't shrink it any further. + scale_step.masked_fill_(is_too_small, 0.0) + # when it gets too large, stop it from getting any larger. + scale_step.masked_fill_(is_too_large, -size_lr * size_update_period) + delta = state["delta"] + # the factor of (1-beta1) relates to momentum. + delta.add_(p * scale_step, alpha=(1 - beta1)) + + def _step(self, group: dict, p: Tensor, state: dict): + """ + This function does the core update of self.step(), in the case where the members of + the batch have more than 1 element. + + Args: + group: A dict which will be used to look up configuration values + p: The parameter to be updated + grad: The grad of p + state: The state-dict corresponding to parameter p + + This function modifies p. + """ + grad = p.grad + lr = group["lr"] + beta1, beta2 = group["betas"] + eps = group["eps"] + param_min_rms = group["param_min_rms"] + step = state["step"] + + exp_avg_sq = state["exp_avg_sq"] + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) + + this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0) + bias_correction2 = 1 - beta2 ** (this_step + 1) + if bias_correction2 < 0.99: + # note: not in-place. + exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) + + denom = exp_avg_sq.sqrt() + denom += eps + grad = grad / denom + + alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms) + + delta = state["delta"] + delta.add_(grad * alpha) + p.add_(delta) + + def _step_scalar(self, group: dict, p: Tensor, state: dict): + """ + A simplified form of the core update for scalar tensors, where we cannot get a good + estimate of the parameter rms. + """ + beta1, beta2 = group["betas"] + scalar_max = group["scalar_max"] + eps = group["eps"] + lr = group["lr"] * group["scalar_lr_scale"] + grad = p.grad + + exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # bias_correction2 is like in Adam. Don't bother with bias_correction1; + # slower update at the start will help stability anyway. 
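+        # e.g. with beta2=0.98: bias_correction2 is 0.02 at step 0 and roughly 0.87 by
+        # step 99, so the early denominator is inflated and the update correspondingly
+        # smaller (the slower start mentioned above).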
+ bias_correction2 = 1 - beta2 ** (state["step"] + 1) + denom = (exp_avg_sq / bias_correction2).sqrt() + eps + + delta = state["delta"] + delta.add_(grad / denom, alpha=-lr * (1 - beta1)) + p.clamp_(min=-scalar_max, max=scalar_max) + p.add_(delta) diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..5bffcea63c7081571defc308b601b748fe4eb797 --- /dev/null +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py @@ -0,0 +1,428 @@ +from torch.nn.functional import * +from torch.nn.functional import ( + _mha_shape_check, + _canonical_mask, + _none_or_dtype, + _in_projection_packed, +) +import torch +# Tensor = torch.Tensor +# from typing import Callable, List, Optional, Tuple, Union + + +def multi_head_attention_forward_patched( + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p: float, + out_proj_weight, + out_proj_bias, + training=True, + key_padding_mask=None, + need_weights=True, + attn_mask=None, + use_separate_proj_weight=False, + q_proj_weight=None, + k_proj_weight=None, + v_proj_weight=None, + static_k=None, + static_v=None, + average_attn_weights=True, + is_causal=False, + cache=None, +): + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + Default: `True` + Note: `needs_weight` defaults to `True`, but should be set to `False` + For best performance when attention weights are not nedeeded. + *Setting needs_weights to `True` + leads to a significant performance degradation.* + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + is_causal: If specified, applies a causal mask as attention mask, and ignores + attn_mask for computing scaled dot product attention. + Default: ``False``. + .. warning:: + is_causal is provides a hint that the attn_mask is the + causal mask.Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. If false, in_proj_weight will be used, which is + a combination of q_proj_weight, k_proj_weight, v_proj_weight. + q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads. 
+ Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect + when ``need_weights=True.``. Default: True + + + Shape: + Inputs: + - query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a FloatTensor is provided, it will be directly added to the value. + If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked + positions. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + + Outputs: + - attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns + attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`. 
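+
+    Note:
+        ``cache``, when given, enables incremental (KV-cached) decoding. Judging from the
+        code below, a minimal cache for ``all_stage`` attention layers could be built
+        roughly like (sketch only; ``num_layers`` is hypothetical)::
+
+            cache = {
+                "all_stage": num_layers,
+                "first_infer": 1,
+                "stage": 0,
+                "k": [None] * num_layers,
+                "v": [None] * num_layers,
+            }
+
+        On the first call each layer stores its projected k/v; later calls concatenate the
+        new step along dim 0 and reuse the stored keys/values.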
+ """ + tens_ops = ( + query, + key, + value, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + out_proj_weight, + out_proj_bias, + ) + if has_torch_function(tens_ops): + return handle_torch_function( + multi_head_attention_forward, + tens_ops, + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p, + out_proj_weight, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + is_causal=is_causal, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, + k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, + static_k=static_k, + static_v=static_v, + average_attn_weights=average_attn_weights, + cache=cache, + ) + + is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads) + + # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input + # is batched, run the computation and before returning squeeze the + # batch dimension so that the output doesn't carry this temporary batch dimension. + if not is_batched: + # unsqueeze if the input is unbatched + query = query.unsqueeze(1) + key = key.unsqueeze(1) + value = value.unsqueeze(1) + if key_padding_mask is not None: + key_padding_mask = key_padding_mask.unsqueeze(0) + + # set up shape vars + tgt_len, bsz, embed_dim = query.shape + src_len, _, _ = key.shape + + key_padding_mask = _canonical_mask( + mask=key_padding_mask, + mask_name="key_padding_mask", + other_type=_none_or_dtype(attn_mask), + other_name="attn_mask", + target_type=query.dtype, + ) + + if is_causal and attn_mask is None: + raise RuntimeError( + "Need attn_mask if specifying the is_causal hint. " + "You may use the Transformer module method " + "`generate_square_subsequent_mask` to create this mask." + ) + + if is_causal and key_padding_mask is None and not need_weights: + # when we have a kpm or need weights, we need attn_mask + # Otherwise, we use the is_causal hint go as is_causal + # indicator to SDPA. + attn_mask = None + else: + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + + if key_padding_mask is not None: + # We have the attn_mask, and use that to merge kpm into it. + # Turn off use of is_causal hint, as the merged mask is no + # longer causal. 
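+            # (the merged mask already encodes both causality and padding, so the
+            #  is_causal fast path must not be applied on top of it)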
+ is_causal = False + + assert embed_dim == embed_dim_to_check, ( + f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + ) + if isinstance(embed_dim, torch.Tensor): + # embed_dim can be a tensor when JIT tracing + head_dim = embed_dim.div(num_heads, rounding_mode="trunc") + else: + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + if use_separate_proj_weight: + # allow MHA to have different embedding dimensions when separate projection weights are used + assert key.shape[:2] == value.shape[:2], ( + f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + ) + else: + assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" + + # + # compute in-projection + # + if not use_separate_proj_weight: + assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None" + q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) + else: + assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None" + assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None" + assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None" + if in_proj_bias is None: + b_q = b_k = b_v = None + else: + b_q, b_k, b_v = in_proj_bias.chunk(3) + q, k, v = _in_projection( + query, + key, + value, + q_proj_weight, + k_proj_weight, + v_proj_weight, + b_q, + b_k, + b_v, + ) + if cache != None: + if cache["first_infer"] == 1: + cache["k"][cache["stage"]] = k + # print(0,cache["k"].shape) + cache["v"][cache["stage"]] = v + else: ###12个layer每个都要留自己的cache_kv + # print(1,cache["k"].shape) + cache["k"][cache["stage"]] = torch.cat( + [cache["k"][cache["stage"]], k], 0 + ) ##本来时序是1,但是proj的时候可能transpose了所以时序到0维了 + cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]], v], 0) + # print(2, cache["k"].shape) + src_len = cache["k"][cache["stage"]].shape[0] + k = cache["k"][cache["stage"]] + v = cache["v"][cache["stage"]] + # if attn_mask is not None: + # attn_mask=attn_mask[-1:,] + # print(attn_mask.shape,attn_mask) + cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] + # print(2333,cache) + # prep attention mask + + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=q.dtype, + check_other=False, + ) + + if attn_mask is not None: + # ensure attn_mask's dim is 3 + if attn_mask.dim() == 2: + correct_2d_size = (tgt_len, src_len) + if attn_mask.shape != correct_2d_size: + raise RuntimeError( + f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}." + ) + attn_mask = attn_mask.unsqueeze(0) + elif attn_mask.dim() == 3: + correct_3d_size = (bsz * num_heads, tgt_len, src_len) + if attn_mask.shape != correct_3d_size: + raise RuntimeError( + f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." + ) + else: + raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") + + # add bias along batch dimension (currently second) + if bias_k is not None and bias_v is not None: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." 
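+        # bias_k/bias_v are typically shaped (1, 1, embed_dim) (as in nn.MultiheadAttention),
+        # so the repeat over bsz appends one extra position to the source sequence.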
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + else: + assert bias_k is None + assert bias_v is None + + # + # reshape q, k, v for multihead attention and make em batch first + # + q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if static_k is None: + k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert static_k.size(0) == bsz * num_heads, ( + f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" + ) + assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + k = static_k + if static_v is None: + v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert static_v.size(0) == bsz * num_heads, ( + f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" + ) + assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + v = static_v + + # add zero attention along batch dimension (now first) + if add_zero_attn: + zero_attn_shape = (bsz * num_heads, 1, head_dim) + k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + # update source sequence length after adjustments + src_len = k.size(1) + + # merge key padding and attention masks + if key_padding_mask is not None: + assert key_padding_mask.shape == ( + bsz, + src_len, + ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" + key_padding_mask = ( + key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + ) + if attn_mask is None: + attn_mask = key_padding_mask + else: + attn_mask = attn_mask + key_padding_mask + + # adjust dropout probability + if not training: + dropout_p = 0.0 + + # + # (deep breath) calculate attention and out projection + # + + if need_weights: + B, Nt, E = q.shape + q_scaled = q / math.sqrt(E) + + assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights" + + if attn_mask is not None: + attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1)) + else: + attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) + attn_output_weights = softmax(attn_output_weights, dim=-1) + if dropout_p > 0.0: + attn_output_weights = dropout(attn_output_weights, p=dropout_p) + + attn_output = torch.bmm(attn_output_weights, v) + + attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) + + # optionally average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + if average_attn_weights: + attn_output_weights = attn_output_weights.mean(dim=1) + + if not is_batched: + # squeeze the output if 
input was unbatched + attn_output = attn_output.squeeze(1) + attn_output_weights = attn_output_weights.squeeze(0) + return attn_output, attn_output_weights + else: + # attn_mask can be either (L,S) or (N*num_heads, L, S) + # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S) + # in order to match the input for SDPA of (N, num_heads, L, S) + if attn_mask is not None: + if attn_mask.size(0) == 1 and attn_mask.dim() == 3: + attn_mask = attn_mask.unsqueeze(0) + else: + attn_mask = attn_mask.view(bsz, num_heads, -1, src_len) + + q = q.view(bsz, num_heads, tgt_len, head_dim) + k = k.view(bsz, num_heads, src_len, head_dim) + v = v.view(bsz, num_heads, src_len, head_dim) + + # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) + + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) + + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) + if not is_batched: + # squeeze the output if input was unbatched + attn_output = attn_output.squeeze(1) + return attn_output, None diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..8144c9c6b930dc825d56deb6b71229c037efb405 --- /dev/null +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py @@ -0,0 +1,85 @@ +from torch.nn.functional import * +from torch.nn.functional import ( + _canonical_mask, +) + + +def multi_head_attention_forward_patched( + query, + key, + value, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight, + in_proj_bias: Optional[Tensor], + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Optional[Tensor], + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + average_attn_weights: bool = True, + is_causal: bool = False, + cache=None, +) -> Tuple[Tensor, Optional[Tensor]]: + # set up shape vars + _, _, embed_dim = query.shape + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + head_dim = embed_dim // num_heads + + proj_qkv = linear(query, in_proj_weight, in_proj_bias) + proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() + q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] + + if cache["first_infer"] == 1: + cache["k"][cache["stage"]] = k + cache["v"][cache["stage"]] = v + else: + cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) + cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) + k = cache["k"][cache["stage"]] + v = cache["v"][cache["stage"]] + cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] + + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=q.dtype, + check_other=False, + ) + attn_mask = 
attn_mask.unsqueeze(0) + + q = q.view(-1, num_heads, head_dim).transpose(0, 1) + k = k.view(-1, num_heads, head_dim).transpose(0, 1) + v = v.view(-1, num_heads, head_dim).transpose(0, 1) + + dropout_p = 0.0 + attn_mask = attn_mask.unsqueeze(0) + q = q.view(num_heads, -1, head_dim).unsqueeze(0) + k = k.view(num_heads, -1, head_dim).unsqueeze(0) + v = v.view(num_heads, -1, head_dim).unsqueeze(0) + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(-1, 1, attn_output.size(1)) + + return attn_output diff --git a/GPT_SoVITS/AR/modules/scaling.py b/GPT_SoVITS/AR/modules/scaling.py new file mode 100644 index 0000000000000000000000000000000000000000..aae1453316adc42b7ed17b7f0a6c776a78347e6a --- /dev/null +++ b/GPT_SoVITS/AR/modules/scaling.py @@ -0,0 +1,320 @@ +# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random +from typing import Optional +from typing import Tuple + +import torch +import torch.nn as nn +from torch import Tensor + + +class DoubleSwishFunction(torch.autograd.Function): + """ + double_swish(x) = x * torch.sigmoid(x-1) + This is a definition, originally motivated by its close numerical + similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). + + Memory-efficient derivative computation: + double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) + double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). + Now, s'(x) = s(x) * (1-s(x)). + double_swish'(x) = x * s'(x) + s(x). + = x * s(x) * (1-s(x)) + s(x). + = double_swish(x) * (1-s(x)) + s(x) + ... so we just need to remember s(x) but not x itself. + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + x_dtype = x.dtype + if x.dtype == torch.float16: + x = x.to(torch.float32) + + s = torch.sigmoid(x - 1.0) + y = x * s + + if requires_grad: + deriv = y * (1 - s) + s + # notes on derivative of x * sigmoid(x - 1): + # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 + # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bund + # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. + # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which + # floors), should be expectation-preserving. + floor = -0.043637 + ceil = 1.2 + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(deriv) + if __name__ == "__main__": + # for self-testing only. 
+ assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.043637 + ceil = 1.2 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class DoubleSwish(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return double-swish activation function which is an approximation to Swish(Swish(x)), + that we approximate closely with x * sigmoid(x-1). + """ + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x * torch.sigmoid(x - 1.0) + return DoubleSwishFunction.apply(x) + + +class ActivationBalancerFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + scale_factor: Tensor, + sign_factor: Optional[Tensor], + channel_dim: int, + ) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + ctx.channel_dim = channel_dim + xgt0 = x > 0 + if sign_factor is None: + ctx.save_for_backward(xgt0, scale_factor) + else: + ctx.save_for_backward(xgt0, scale_factor, sign_factor) + return x + + @staticmethod + def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]: + if len(ctx.saved_tensors) == 3: + xgt0, scale_factor, sign_factor = ctx.saved_tensors + for _ in range(ctx.channel_dim, x_grad.ndim - 1): + scale_factor = scale_factor.unsqueeze(-1) + sign_factor = sign_factor.unsqueeze(-1) + factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5) + else: + xgt0, scale_factor = ctx.saved_tensors + for _ in range(ctx.channel_dim, x_grad.ndim - 1): + scale_factor = scale_factor.unsqueeze(-1) + factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5) + neg_delta_grad = x_grad.abs() * factor + return ( + x_grad - neg_delta_grad, + None, + None, + None, + ) + + +def _compute_scale_factor( + x: Tensor, + channel_dim: int, + min_abs: float, + max_abs: float, + gain_factor: float, + max_factor: float, +) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + sum_dims = [d for d in range(x.ndim) if d != channel_dim] + x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32) + + if min_abs == 0.0: + below_threshold = 0.0 + else: + # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if + # x_abs)_mean , min_abs. + below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(min=0, max=max_factor) + + above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(min=0, max=max_factor) + + return below_threshold - above_threshold + + +def _compute_sign_factor( + x: Tensor, + channel_dim: int, + min_positive: float, + max_positive: float, + gain_factor: float, + max_factor: float, +) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + sum_dims = [d for d in range(x.ndim) if d != channel_dim] + proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims) + if min_positive == 0.0: + factor1 = 0.0 + else: + # 0 if proportion_positive >= min_positive, else can be + # as large as max_factor. + factor1 = ((min_positive - proportion_positive) * (gain_factor / min_positive)).clamp_(min=0, max=max_factor) + + if max_positive == 1.0: + factor2 = 0.0 + else: + # 0 if self.proportion_positive <= max_positive, else can be + # as large as -max_factor. 
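+        # e.g. with max_positive=0.95, gain_factor=0.01, max_factor=0.04, a channel that is
+        # positive 99% of the time gets factor2 = (0.99 - 0.95) * (0.01 / 0.05) = 0.008,
+        # well under the max_factor clamp.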
+ factor2 = ((proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))).clamp_( + min=0, max=max_factor + ) + sign_factor = factor1 - factor2 + # require min_positive != 0 or max_positive != 1: + assert not isinstance(sign_factor, float) + return sign_factor + + +class ActivationBalancer(torch.nn.Module): + """ + Modifies the backpropped derivatives of a function to try to encourage, for + each channel, that it is positive at least a proportion `threshold` of the + time. It does this by multiplying negative derivative values by up to + (1+max_factor), and positive derivative values by up to (1-max_factor), + interpolated from 1 at the threshold to those extremal values when none + of the inputs are positive. + + Args: + num_channels: the number of channels + channel_dim: the dimension/axis corresponding to the channel, e.g. + -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. + min_positive: the minimum, per channel, of the proportion of the time + that (x > 0), below which we start to modify the derivatives. + max_positive: the maximum, per channel, of the proportion of the time + that (x > 0), above which we start to modify the derivatives. + max_factor: the maximum factor by which we modify the derivatives for + either the sign constraint or the magnitude constraint; + e.g. with max_factor=0.02, the the derivatives would be multiplied by + values in the range [0.98..1.02]. + sign_gain_factor: determines the 'gain' with which we increase the + change in gradient once the constraints on min_positive and max_positive + are violated. + scale_gain_factor: determines the 'gain' with which we increase the + change in gradient once the constraints on min_abs and max_abs + are violated. + min_abs: the minimum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + max_abs: the maximum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + min_prob: determines the minimum probability with which we modify the + gradients for the {min,max}_positive and {min,max}_abs constraints, + on each forward(). This is done randomly to prevent all layers + from doing it at the same time. Early in training we may use + higher probabilities than this; it will decay to this value. + """ + + def __init__( + self, + num_channels: int, + channel_dim: int, + min_positive: float = 0.05, + max_positive: float = 0.95, + max_factor: float = 0.04, + sign_gain_factor: float = 0.01, + scale_gain_factor: float = 0.02, + min_abs: float = 0.2, + max_abs: float = 100.0, + min_prob: float = 0.1, + ): + super(ActivationBalancer, self).__init__() + self.num_channels = num_channels + self.channel_dim = channel_dim + self.min_positive = min_positive + self.max_positive = max_positive + self.max_factor = max_factor + self.min_abs = min_abs + self.max_abs = max_abs + self.min_prob = min_prob + self.sign_gain_factor = sign_gain_factor + self.scale_gain_factor = scale_gain_factor + + # count measures how many times the forward() function has been called. + # We occasionally sync this to a tensor called `count`, that exists to + # make sure it is synced to disk when we load and save the model. 
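+        # (cpu_count is the cheap Python-side counter used on every forward(); `count` is
+        #  the registered buffer that lands in the state_dict, so the decay schedule for
+        #  `prob` survives checkpoint save/load)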
+ self.cpu_count = 0 + self.register_buffer("count", torch.tensor(0, dtype=torch.int64)) + + def forward(self, x: Tensor) -> Tensor: + if torch.jit.is_scripting() or not x.requires_grad or torch.jit.is_tracing(): + return _no_op(x) + + count = self.cpu_count + self.cpu_count += 1 + + if random.random() < 0.01: + # Occasionally sync self.cpu_count with self.count. + # count affects the decay of 'prob'. don't do this on every iter, + # because syncing with the GPU is slow. + self.cpu_count = max(self.cpu_count, self.count.item()) + self.count.fill_(self.cpu_count) + + # the prob of doing some work exponentially decreases from 0.5 till it hits + # a floor at min_prob (==0.1, by default) + prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0))) + + if random.random() < prob: + sign_gain_factor = 0.5 + if self.min_positive != 0.0 or self.max_positive != 1.0: + sign_factor = _compute_sign_factor( + x, + self.channel_dim, + self.min_positive, + self.max_positive, + gain_factor=self.sign_gain_factor / prob, + max_factor=self.max_factor, + ) + else: + sign_factor = None + + scale_factor = _compute_scale_factor( + x.detach(), + self.channel_dim, + min_abs=self.min_abs, + max_abs=self.max_abs, + gain_factor=self.scale_gain_factor / prob, + max_factor=self.max_factor, + ) + return ActivationBalancerFunction.apply( + x, + scale_factor, + sign_factor, + self.channel_dim, + ) + else: + return _no_op(x) + + +def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25) -> nn.Sequential: + """ + ActivationBalancer -> DoubleSwish + """ + balancer = ActivationBalancer(d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob) + return nn.Sequential( + balancer, + DoubleSwish(), + ) diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf21cdbbc21006af785534d0e528da703dd68d3 --- /dev/null +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -0,0 +1,362 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py +import copy +import numbers +from functools import partial +from typing import Any +from typing import Callable +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +from AR.modules.activation import MultiheadAttention +from AR.modules.scaling import BalancedDoubleSwish +from torch import nn +from torch import Tensor +from torch.nn import functional as F + +_shape_t = Union[int, List[int], torch.Size] + + +class LayerNorm(nn.Module): + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: Tuple[int, ...] 
+ eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + return ( + F.layer_norm( + input, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ), + embedding, + ) + + assert embedding is None + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) + + +class IdentityNorm(nn.Module): + def __init__( + self, + d_model: int, + eps: float = 1e-5, + device=None, + dtype=None, + ) -> None: + super(IdentityNorm, self).__init__() + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + return input + + assert embedding is None + return input + + +class TransformerEncoder(nn.Module): + r"""TransformerEncoder is a stack of N encoder layers. Users can build the + BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). + + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + return_layer_states: bool = False, + cache=None, + ) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + return_layer_states: return layers' state (optional). + + Shape: + see the docs in Transformer class. 
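+
+        Returns:
+            the encoder output, or the tuple ``(layer_states, output)`` when
+            ``return_layer_states`` is True.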
+ """ + if return_layer_states: + layer_states = [] # layers' output + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + layer_states.append(output[0]) + + if self.norm is not None: + output = self.norm(output) + + return layer_states, output + + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + __constants__ = ["batch_first", "norm_first"] + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + batch_first: bool = False, + norm_first: bool = False, + device=None, + dtype=None, + linear1_self_attention_cls: nn.Module = nn.Linear, + linear2_self_attention_cls: nn.Module = nn.Linear, + linear1_feedforward_cls: nn.Module = nn.Linear, + linear2_feedforward_cls: nn.Module = nn.Linear, + layer_norm_cls: nn.Module = LayerNorm, + layer_norm_eps: float = 1e-5, + adaptive_layer_norm=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(TransformerEncoderLayer, self).__init__() + # print(233333333333,d_model,nhead) + # import os + # os._exit(2333333) + self.self_attn = MultiheadAttention( + d_model, # 512 16 + nhead, + dropout=dropout, + batch_first=batch_first, + linear1_cls=linear1_self_attention_cls, + linear2_cls=linear2_self_attention_cls, + **factory_kwargs, + ) + + # Implementation of Feedforward model + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) + self.dropout = nn.Dropout(dropout) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) + + self.norm_first = norm_first + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + # Legacy string support for activation function. + if isinstance(activation, str): + activation = _get_activation_fn(activation) + elif isinstance(activation, partial): + activation = activation(d_model) + elif activation == BalancedDoubleSwish: + activation = BalancedDoubleSwish(d_model) + + # # We can't test self.activation in forward() in TorchScript, + # # so stash some information about it instead. + # if activation is F.relu or isinstance(activation, torch.nn.ReLU): + # self.activation_relu_or_gelu = 1 + # elif activation is F.gelu or isinstance(activation, torch.nn.GELU): + # self.activation_relu_or_gelu = 2 + # else: + # self.activation_relu_or_gelu = 0 + self.activation = activation + + norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + if layer_norm_cls == IdentityNorm: + norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + else: + norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + + if adaptive_layer_norm: + self.norm1 = AdaptiveLayerNorm(d_model, norm1) + self.norm2 = AdaptiveLayerNorm(d_model, norm2) + else: + self.norm1 = norm1 + self.norm2 = norm2 + + def __setstate__(self, state): + super(TransformerEncoderLayer, self).__setstate__(state) + if not hasattr(self, "activation"): + self.activation = F.relu + + def forward( + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + cache=None, + ) -> Tensor: + r"""Pass the input through the encoder layer. 
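+
+        Uses pre-norm residual connections when ``norm_first`` is True and post-norm
+        otherwise; the optional ``cache`` is passed through to the self-attention block.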
+ + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + + Shape: + see the docs in Transformer class. + """ + x, stage_embedding = src, None + is_src_tuple = False + if isinstance(src, tuple): + x, stage_embedding = src + is_src_tuple = True + + if src_key_padding_mask is not None: + _skpm_dtype = src_key_padding_mask.dtype + if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask): + raise AssertionError("only bool and floating types of key_padding_mask are supported") + + if self.norm_first: + x = x + self._sa_block( + self.norm1(x, stage_embedding), + src_mask, + src_key_padding_mask, + cache=cache, + ) + x = x + self._ff_block(self.norm2(x, stage_embedding)) + else: + x = self.norm1( + x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), + stage_embedding, + ) + x = self.norm2(x + self._ff_block(x), stage_embedding) + + if is_src_tuple: + return (x, stage_embedding) + return x + + # self-attention block + def _sa_block( + self, + x: Tensor, + attn_mask: Optional[Tensor], + key_padding_mask: Optional[Tensor], + cache=None, + ) -> Tensor: + # print(x.shape,attn_mask.shape,key_padding_mask) + # torch.Size([1, 188, 512]) torch.Size([188, 188]) None + # import os + # os._exit(23333) + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + cache=cache, + )[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class AdaptiveLayerNorm(nn.Module): + r"""Adaptive Layer Normalization""" + + def __init__(self, d_model, norm) -> None: + super(AdaptiveLayerNorm, self).__init__() + self.project_layer = nn.Linear(d_model, 2 * d_model) + self.norm = norm + self.d_model = d_model + self.eps = self.norm.eps + + def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return (weight * self.norm(input) + bias, embedding) + + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return weight * self.norm(input) + bias + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1702548551ccd5166c50ca238a58b136144454 --- /dev/null +++ b/GPT_SoVITS/AR/modules/transformer_onnx.py @@ -0,0 +1,281 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py +import copy +import numbers +from functools import partial +from typing import Any +from typing import Callable +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +from AR.modules.activation_onnx import MultiheadAttention +from AR.modules.scaling import BalancedDoubleSwish +from torch import nn +from torch import Tensor +from torch.nn import functional as F + +_shape_t = Union[int, List[int], torch.Size] + + +class LayerNorm(nn.Module): + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: 
Tuple[int, ...] + eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + return ( + F.layer_norm( + input, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ), + embedding, + ) + + assert embedding is None + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) + + +class IdentityNorm(nn.Module): + def __init__( + self, + d_model: int, + eps: float = 1e-5, + device=None, + dtype=None, + ) -> None: + super(IdentityNorm, self).__init__() + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + return input + + assert embedding is None + return input + + +class TransformerEncoder(nn.Module): + r"""TransformerEncoder is a stack of N encoder layers. Users can build the + BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). 
+ + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + return_layer_states: bool = False, + cache=None, + ) -> Tensor: + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + __constants__ = ["batch_first", "norm_first"] + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + batch_first: bool = False, + norm_first: bool = False, + device=None, + dtype=None, + linear1_self_attention_cls: nn.Module = nn.Linear, + linear2_self_attention_cls: nn.Module = nn.Linear, + linear1_feedforward_cls: nn.Module = nn.Linear, + linear2_feedforward_cls: nn.Module = nn.Linear, + layer_norm_cls: nn.Module = LayerNorm, + layer_norm_eps: float = 1e-5, + adaptive_layer_norm=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, # 512 16 + nhead, + dropout=dropout, + batch_first=batch_first, + linear1_cls=linear1_self_attention_cls, + linear2_cls=linear2_self_attention_cls, + **factory_kwargs, + ) + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) + self.dropout = nn.Dropout(dropout) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) + self.norm_first = norm_first + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + if isinstance(activation, str): + activation = _get_activation_fn(activation) + elif isinstance(activation, partial): + activation = activation(d_model) + elif activation == BalancedDoubleSwish: + activation = BalancedDoubleSwish(d_model) + self.activation = activation + + norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + if layer_norm_cls == IdentityNorm: + norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + else: + norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + + if adaptive_layer_norm: + self.norm1 = AdaptiveLayerNorm(d_model, norm1) + self.norm2 = AdaptiveLayerNorm(d_model, norm2) + else: + self.norm1 = norm1 + self.norm2 = norm2 + + def __setstate__(self, state): + super(TransformerEncoderLayer, self).__setstate__(state) + if not hasattr(self, "activation"): + self.activation = F.relu + + def forward( + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + cache=None, + ) -> Tensor: + x = src + stage_embedding = None + x = self.norm1( + x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), + stage_embedding, + ) + x = self.norm2(x + self._ff_block(x), stage_embedding) + + return x + + def _sa_block( + self, + x: Tensor, + attn_mask: Optional[Tensor], + key_padding_mask: 
Optional[Tensor], + cache=None, + ) -> Tensor: + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + cache=cache, + ) + return self.dropout1(x) + + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class AdaptiveLayerNorm(nn.Module): + r"""Adaptive Layer Normalization""" + + def __init__(self, d_model, norm) -> None: + super(AdaptiveLayerNorm, self).__init__() + self.project_layer = nn.Linear(d_model, 2 * d_model) + self.norm = norm + self.d_model = d_model + self.eps = self.norm.eps + + def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return (weight * self.norm(input) + bias, embedding) + + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return weight * self.norm(input) + bias + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/GPT_SoVITS/AR/text_processing/__init__.py b/GPT_SoVITS/AR/text_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py new file mode 100644 index 0000000000000000000000000000000000000000..1003040e282c51e4e240a122bce4f3b87a09b38f --- /dev/null +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ -0,0 +1,72 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py +# reference: https://github.com/lifeiteng/vall-e +import itertools +import re +from typing import Dict +from typing import List + +import regex +from gruut import sentences +from gruut.const import Sentence +from gruut.const import Word +from AR.text_processing.symbols import SYMBOL_TO_ID + + +class GruutPhonemizer: + def __init__(self, language: str): + self._phonemizer = sentences + self.lang = language + self.symbol_to_id = SYMBOL_TO_ID + self._special_cases_dict: Dict[str] = { + r"\.\.\.": "... ", + ";": "; ", + ":": ": ", + ",": ", ", + r"\.": ". ", + "!": "! ", + r"\?": "? 
", + "—": "—", + "…": "… ", + "«": "«", + "»": "»", + } + self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" + + def _normalize_punctuation(self, text: str) -> str: + text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) + text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) + text = regex.sub(r"\pZ+", r" ", text) + return text.strip() + + def _convert_punctuation(self, word: Word) -> str: + if not word.phonemes: + return "" + if word.phonemes[0] in ["‖", "|"]: + return word.text.strip() + + phonemes = "".join(word.phonemes) + # remove modifier characters ˈˌː with regex + phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) + return phonemes.strip() + + def phonemize(self, text: str, espeak: bool = False) -> str: + text_to_phonemize: str = self._normalize_punctuation(text) + sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] + words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] + return " ".join(words) + + def transform(self, phonemes): + # convert phonemes to ids + # dictionary is in symbols.py + return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] + + +if __name__ == "__main__": + phonemizer = GruutPhonemizer("en-us") + # text -> IPA + phonemes = phonemizer.phonemize("Hello, wor-ld ?") + print("phonemes:", phonemes) + print("len(phonemes):", len(phonemes)) + phoneme_ids = phonemizer.transform(phonemes) + print("phoneme_ids:", phoneme_ids) + print("len(phoneme_ids):", len(phoneme_ids)) diff --git a/GPT_SoVITS/AR/text_processing/symbols.py b/GPT_SoVITS/AR/text_processing/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..f7ef57faf5b83cb2417b4f9244244dc9939153aa --- /dev/null +++ b/GPT_SoVITS/AR/text_processing/symbols.py @@ -0,0 +1,12 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py +# reference: https://github.com/lifeiteng/vall-e +PAD = "_" +PUNCTUATION = ';:,.!?¡¿—…"«»“” ' +LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +IPA_LETTERS = ( + "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +) +SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) +SPACE_ID = SYMBOLS.index(" ") +SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} +ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} diff --git a/GPT_SoVITS/AR/utils/__init__.py b/GPT_SoVITS/AR/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9cb4d58504c17a96634ddd98f22cadad9365de --- /dev/null +++ b/GPT_SoVITS/AR/utils/__init__.py @@ -0,0 +1,36 @@ +import re + + +def str2bool(str): + return True if str.lower() == "true" else False + + +def get_newest_ckpt(string_list): + # 定义一个正则表达式模式,用于匹配字符串中的数字 + pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" + + # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 + extracted_info = [] + for string in string_list: + match = re.match(pattern, string) + if match: + epoch = int(match.group(1)) + step = int(match.group(2)) + extracted_info.append((epoch, step, string)) + # 按照 epoch 后面的数字和 step 后面的数字进行排序 + sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) + # 获取最新的 ckpt 文件名 + newest_ckpt = sorted_info[0][2] + return newest_ckpt + + +# 文本存在且不为空时 return True +def check_txt_file(file_path): + try: + with open(file_path, "r") as file: + text = file.readline().strip() + assert text.strip() != "" + return 
text + except Exception: + return False + return False diff --git a/GPT_SoVITS/AR/utils/initialize.py b/GPT_SoVITS/AR/utils/initialize.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7c713823f57572ab8f7045ceba21e8e2619e4c --- /dev/null +++ b/GPT_SoVITS/AR/utils/initialize.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Initialize modules for espnet2 neural networks.""" + +import torch +from typeguard import check_argument_types + + +def initialize(model: torch.nn.Module, init: str): + """Initialize weights of a neural network module. + + Parameters are initialized using the given method or distribution. + + Custom initialization routines can be implemented into submodules + as function `espnet_initialization_fn` within the custom module. + + Args: + model: Target. + init: Method of initialization. + """ + assert check_argument_types() + print("init with", init) + + # weight init + for p in model.parameters(): + if p.dim() > 1: + if init == "xavier_uniform": + torch.nn.init.xavier_uniform_(p.data) + elif init == "xavier_normal": + torch.nn.init.xavier_normal_(p.data) + elif init == "kaiming_uniform": + torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") + elif init == "kaiming_normal": + torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") + else: + raise ValueError("Unknown initialization: " + init) + # bias init + for name, p in model.named_parameters(): + if ".bias" in name and p.dim() == 1: + p.data.zero_() diff --git a/GPT_SoVITS/AR/utils/io.py b/GPT_SoVITS/AR/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..a6475cb6b114787acfde5d73e1552cf58e04997b --- /dev/null +++ b/GPT_SoVITS/AR/utils/io.py @@ -0,0 +1,30 @@ +import sys + +import torch +import yaml + + +def load_yaml_config(path): + with open(path) as f: + config = yaml.full_load(f) + return config + + +def save_config_to_yaml(config, path): + assert path.endswith(".yaml") + with open(path, "w") as f: + f.write(yaml.dump(config)) + f.close() + + +def write_args(args, path): + args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) + with open(path, "a") as args_file: + args_file.write("==> torch version: {}\n".format(torch.__version__)) + args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) + args_file.write("==> Cmd:\n") + args_file.write(str(sys.argv)) + args_file.write("\n==> args:\n") + for k, v in sorted(args_dict.items()): + args_file.write(" %s: %s\n" % (str(k), str(v))) + args_file.close() diff --git a/GPT_SoVITS/download.py b/GPT_SoVITS/download.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4ead63bfe3c15326212a6ebabe2dac166e0ff2 --- /dev/null +++ b/GPT_SoVITS/download.py @@ -0,0 +1,13 @@ +import os +import sys + +now_dir = os.getcwd() +sys.path.insert(0, now_dir) +from text.g2pw import G2PWPinyin + +g2pw = G2PWPinyin( + model_dir="GPT_SoVITS/text/G2PWModel", + model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + v_to_u=False, + neutral_tone_with_five=True, +) diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py new file mode 100644 index 0000000000000000000000000000000000000000..69817a3763b140430ad68907be0766823c170056 --- /dev/null +++ b/GPT_SoVITS/export_torch_script.py @@ -0,0 +1,861 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py +# reference: https://github.com/lifeiteng/vall-e +import argparse +from typing import Optional +from my_utils 
import load_audio +import torch +import torchaudio + +from torch import IntTensor, LongTensor, Tensor, nn +from torch.nn import functional as F + +from transformers import AutoModelForMaskedLM, AutoTokenizer +from feature_extractor import cnhubert + +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from module.models_onnx import SynthesizerTrn + +from inference_webui import get_phones_and_bert + +import os +import soundfile + +default_config = { + "embedding_dim": 512, + "hidden_dim": 512, + "num_head": 8, + "num_layers": 12, + "num_codebook": 8, + "p_dropout": 0.0, + "vocab_size": 1024 + 1, + "phoneme_vocab_size": 512, + "EOS": 1024, +} + + +def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: + config = dict_s1["config"] + config["model"]["dropout"] = float(config["model"]["dropout"]) + t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) + t2s_model.load_state_dict(dict_s1["weight"]) + t2s_model = t2s_model.eval() + return t2s_model + + +@torch.jit.script +def logits_to_probs( + logits, + previous_tokens: Optional[torch.Tensor] = None, + temperature: float = 1.0, + top_k: Optional[int] = None, + top_p: Optional[int] = None, + repetition_penalty: float = 1.0, +): + # if previous_tokens is not None: + # previous_tokens = previous_tokens.squeeze() + # print(logits.shape,previous_tokens.shape) + # pdb.set_trace() + if previous_tokens is not None and repetition_penalty != 1.0: + previous_tokens = previous_tokens.long() + score = torch.gather(logits, dim=1, index=previous_tokens) + score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty) + logits.scatter_(dim=1, index=previous_tokens, src=score) + + if top_p is not None and top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) + sorted_indices_to_remove = cum_probs > top_p + sorted_indices_to_remove[:, 0] = False # keep at least one option + indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) + logits = logits.masked_fill(indices_to_remove, -float("Inf")) + + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v[:, -1].unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +@torch.jit.script +def multinomial_sample_one_no_sync(probs_sort): + # Does multinomial sampling without a cuda synchronization + q = torch.randn_like(probs_sort) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +@torch.jit.script +def sample( + logits, + previous_tokens, + temperature: float = 1.0, + top_k: Optional[int] = None, + top_p: Optional[int] = None, + repetition_penalty: float = 1.0, +): + probs = logits_to_probs( + logits=logits, + previous_tokens=previous_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + ) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +@torch.jit.script +def spectrogram_torch(y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False): + hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype) + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = 
y.squeeze(1) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +@torch.jit.script +class T2SMLP: + def __init__(self, w1, b1, w2, b2): + self.w1 = w1 + self.b1 = b1 + self.w2 = w2 + self.b2 = b2 + + def forward(self, x): + x = F.relu(F.linear(x, self.w1, self.b1)) + x = F.linear(x, self.w2, self.b2) + return x + + +@torch.jit.script +class T2SBlock: + def __init__( + self, + num_heads: int, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1: float, + norm_w2, + norm_b2, + norm_eps2: float, + ): + self.num_heads = num_heads + self.mlp = mlp + self.hidden_dim: int = hidden_dim + self.qkv_w = qkv_w + self.qkv_b = qkv_b + self.out_w = out_w + self.out_b = out_b + self.norm_w1 = norm_w1 + self.norm_b1 = norm_b1 + self.norm_eps1 = norm_eps1 + self.norm_w2 = norm_w2 + self.norm_b2 = norm_b2 + self.norm_eps2 = norm_eps2 + + self.false = torch.tensor(False, dtype=torch.bool) + + @torch.jit.ignore + def to_mask(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor]): + if padding_mask is None: + return x + + if padding_mask.dtype == torch.bool: + return x.masked_fill(padding_mask, 0) + else: + return x * padding_mask + + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + batch_size = q.shape[0] + q_len = q.shape[1] + kv_len = k.shape[1] + + q = self.to_mask(q, padding_mask) + k_cache = self.to_mask(k, padding_mask) + v_cache = self.to_mask(v, padding_mask) + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) + + attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) + attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) + attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) + + if padding_mask is not None: + for i in range(batch_size): + # mask = padding_mask[i,:,0] + if self.false.device != padding_mask.device: + self.false = self.false.to(padding_mask.device) + idx = torch.where(padding_mask[i, :, 0] == self.false)[0] + x_item = x[i, idx, :].unsqueeze(0) + attn_item = attn[i, idx, :].unsqueeze(0) + x_item = x_item + attn_item + x_item = F.layer_norm(x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) + x_item = x_item + self.mlp.forward(x_item) + x_item = 
F.layer_norm( + x_item, + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + x[i, idx, :] = x_item.squeeze(0) + x = self.to_mask(x, padding_mask) + else: + x = x + attn + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) + x = x + self.mlp.forward(x) + x = F.layer_norm( + x, + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + def decode_next_token(self, x: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor): + q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + k_cache = torch.cat([k_cache, k], dim=1) + v_cache = torch.cat([v_cache, v], dim=1) + + batch_size = q.shape[0] + q_len = q.shape[1] + kv_len = k_cache.shape[1] + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + attn = F.scaled_dot_product_attention(q, k, v) + + attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) + attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) + attn = F.linear(attn, self.out_w, self.out_b) + + x = x + attn + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) + x = x + self.mlp.forward(x) + x = F.layer_norm( + x, + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + +@torch.jit.script +class T2STransformer: + def __init__(self, num_blocks: int, blocks: list[T2SBlock]): + self.num_blocks: int = num_blocks + self.blocks = blocks + + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + k_cache: list[torch.Tensor] = [] + v_cache: list[torch.Tensor] = [] + for i in range(self.num_blocks): + x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask) + k_cache.append(k_cache_) + v_cache.append(v_cache_) + return x, k_cache, v_cache + + def decode_next_token(self, x: torch.Tensor, k_cache: list[torch.Tensor], v_cache: list[torch.Tensor]): + for i in range(self.num_blocks): + x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i]) + return x, k_cache, v_cache + + +class VitsModel(nn.Module): + def __init__(self, vits_path): + super().__init__() + # dict_s2 = torch.load(vits_path,map_location="cpu") + dict_s2 = torch.load(vits_path) + self.hps = dict_s2["config"] + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + self.hps["model"]["version"] = "v1" + else: + self.hps["model"]["version"] = "v2" + + self.hps = DictToAttrRecursive(self.hps) + self.hps.model.semantic_frame_rate = "25hz" + self.vq_model = SynthesizerTrn( + self.hps.data.filter_length // 2 + 1, + self.hps.train.segment_size // self.hps.data.hop_length, + n_speakers=self.hps.data.n_speakers, + **self.hps.model, + ) + self.vq_model.eval() + self.vq_model.load_state_dict(dict_s2["weight"], strict=False) + + def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0): + refer = spectrogram_torch( + ref_audio, + self.hps.data.filter_length, + self.hps.data.sampling_rate, + self.hps.data.hop_length, + self.hps.data.win_length, + center=False, + ) + return self.vq_model(pred_semantic, text_seq, refer, speed)[0, 0] + + +class T2SModel(nn.Module): + def __init__(self, raw_t2s: Text2SemanticLightningModule): + super(T2SModel, self).__init__() + self.model_dim = raw_t2s.model.model_dim + 
self.embedding_dim = raw_t2s.model.embedding_dim + self.num_head = raw_t2s.model.num_head + self.num_layers = raw_t2s.model.num_layers + self.vocab_size = raw_t2s.model.vocab_size + self.phoneme_vocab_size = raw_t2s.model.phoneme_vocab_size + # self.p_dropout = float(raw_t2s.model.p_dropout) + self.EOS: int = int(raw_t2s.model.EOS) + self.norm_first = raw_t2s.model.norm_first + assert self.EOS == self.vocab_size - 1 + self.hz = 50 + + self.bert_proj = raw_t2s.model.bert_proj + self.ar_text_embedding = raw_t2s.model.ar_text_embedding + self.ar_text_position = raw_t2s.model.ar_text_position + self.ar_audio_embedding = raw_t2s.model.ar_audio_embedding + self.ar_audio_position = raw_t2s.model.ar_audio_position + + # self.t2s_transformer = T2STransformer(self.num_layers, blocks) + # self.t2s_transformer = raw_t2s.model.t2s_transformer + + blocks = [] + h = raw_t2s.model.h + + for i in range(self.num_layers): + layer = h.layers[i] + t2smlp = T2SMLP(layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, layer.linear2.bias) + + block = T2SBlock( + self.num_head, + self.model_dim, + t2smlp, + layer.self_attn.in_proj_weight, + layer.self_attn.in_proj_bias, + layer.self_attn.out_proj.weight, + layer.self_attn.out_proj.bias, + layer.norm1.weight, + layer.norm1.bias, + layer.norm1.eps, + layer.norm2.weight, + layer.norm2.bias, + layer.norm2.eps, + ) + + blocks.append(block) + + self.t2s_transformer = T2STransformer(self.num_layers, blocks) + + # self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.ar_predict_layer = raw_t2s.model.ar_predict_layer + # self.loss_fct = nn.CrossEntropyLoss(reduction="sum") + self.max_sec = raw_t2s.config["data"]["max_sec"] + self.top_k = int(raw_t2s.config["inference"]["top_k"]) + self.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) + + def forward( + self, + prompts: LongTensor, + ref_seq: LongTensor, + text_seq: LongTensor, + ref_bert: torch.Tensor, + text_bert: torch.Tensor, + top_k: LongTensor, + ): + bert = torch.cat([ref_bert.T, text_bert.T], 1) + all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) + bert = bert.unsqueeze(0) + + x = self.ar_text_embedding(all_phoneme_ids) + x = x + self.bert_proj(bert.transpose(1, 2)) + x: torch.Tensor = self.ar_text_position(x) + + early_stop_num = self.early_stop_num + + # [1,N,512] [1,N] + # y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) + y = prompts + # x_example = x[:,:,0] * 0.0 + + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) + + y_emb = self.ar_audio_embedding(y) + y_len = y_emb.shape[1] + prefix_len = y.shape[1] + y_pos = self.ar_audio_position(y_emb) + xy_pos = torch.concat([x, y_pos], dim=1) + + bsz = x.shape[0] + src_len = x_len + y_len + x_attn_mask_pad = F.pad( + x_attn_mask, + (0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) + value=True, + ) + y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y) + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) + + idx = 0 + top_k = int(top_k) + + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) + + logits = self.ar_predict_layer(xy_dec[:, -1]) + logits = logits[:, :-1] + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] + y = 
torch.concat([y, samples], dim=1) + y_emb = self.ar_audio_embedding(y[:, -1:]) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + stop = False + # for idx in range(1, 50): + for idx in range(1, 1500): + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + # y, k, v, y_emb, logits, samples = self.stage_decoder(y, k, v, y_emb, x_example) + xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) + logits = logits[:, :-1] + + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] + + y = torch.concat([y, samples], dim=1) + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + if y.shape[1] == 0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + break + + y_emb = self.ar_audio_embedding(y[:, -1:]) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + y[0, -1] = 0 + + return y[:, -idx:].unsqueeze(0) + + +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") +cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" +cnhubert.cnhubert_base_path = cnhubert_base_path + + +@torch.jit.script +def build_phone_level_feature(res: Tensor, word2ph: IntTensor): + phone_level_feature = [] + for i in range(word2ph.shape[0]): + repeat_feature = res[i].repeat(word2ph[i].item(), 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + # [sum(word2ph), 1024] + return phone_level_feature + + +class MyBertModel(torch.nn.Module): + def __init__(self, bert_model): + super(MyBertModel, self).__init__() + self.bert = bert_model + + def forward( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor, token_type_ids: torch.Tensor, word2ph: IntTensor + ): + outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + # res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0][1:-1] + res = torch.cat(outputs[1][-3:-2], -1)[0][1:-1] + return build_phone_level_feature(res, word2ph) + + +class SSLModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.ssl = cnhubert.get_model().model + + def forward(self, ref_audio_16k) -> torch.Tensor: + ssl_content = self.ssl(ref_audio_16k)["last_hidden_state"].transpose(1, 2) + return ssl_content + + +class ExportSSLModel(torch.nn.Module): + def __init__(self, ssl: SSLModel): + super().__init__() + self.ssl = ssl + + def forward(self, ref_audio: torch.Tensor): + return self.ssl(ref_audio) + + @torch.jit.export + def resample(self, ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + audio = resamplex(ref_audio, src_sr, dst_sr).float() + return audio + + +def export_bert(output_path): + tokenizer = AutoTokenizer.from_pretrained(bert_path) + + text = "叹息声一声接着一声传出,木兰对着房门织布.听不见织布机织布的声音,只听见木兰在叹息.问木兰在想什么?问木兰在惦记什么?木兰答道,我也没有在想什么,也没有在惦记什么." 
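+    # Example inputs for tracing MyBertModel: tokenize the sample sentence and build a
+    # rough word2ph map (1 phone per punctuation mark, 2 per Chinese character); the
+    # values only need realistic shapes, since the relevant dims are marked dynamic below.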
+ ref_bert_inputs = tokenizer(text, return_tensors="pt") + word2ph = [] + for c in text: + if c in [",", "。", ":", "?", ",", ".", "?"]: + word2ph.append(1) + else: + word2ph.append(2) + ref_bert_inputs["word2ph"] = torch.Tensor(word2ph).int() + + bert_model = AutoModelForMaskedLM.from_pretrained(bert_path, output_hidden_states=True, torchscript=True) + my_bert_model = MyBertModel(bert_model) + + ref_bert_inputs = { + "input_ids": ref_bert_inputs["input_ids"], + "attention_mask": ref_bert_inputs["attention_mask"], + "token_type_ids": ref_bert_inputs["token_type_ids"], + "word2ph": ref_bert_inputs["word2ph"], + } + + torch._dynamo.mark_dynamic(ref_bert_inputs["input_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["attention_mask"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["token_type_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["word2ph"], 0) + + my_bert_model = torch.jit.trace(my_bert_model, example_kwarg_inputs=ref_bert_inputs) + output_path = os.path.join(output_path, "bert_model.pt") + my_bert_model.save(output_path) + print("#### exported bert ####") + + +def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_bert_and_ssl=False, device="cpu"): + if not os.path.exists(output_path): + os.makedirs(output_path) + print(f"目录已创建: {output_path}") + else: + print(f"目录已存在: {output_path}") + + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float() + ssl = SSLModel() + if export_bert_and_ssl: + s = ExportSSLModel(torch.jit.trace(ssl, example_inputs=(ref_audio))) + ssl_path = os.path.join(output_path, "ssl_model.pt") + torch.jit.script(s).save(ssl_path) + print("#### exported ssl ####") + export_bert(output_path) + else: + s = ExportSSLModel(ssl) + + print(f"device: {device}") + + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") + ref_seq = torch.LongTensor([ref_seq_id]).to(device) + ref_bert = ref_bert_T.T.to(ref_seq.device) + text_seq_id, text_bert_T, norm_text = get_phones_and_bert( + "这是一条测试语音,说什么无所谓,只是给它一个例子", "all_zh", "v2" + ) + text_seq = torch.LongTensor([text_seq_id]).to(device) + text_bert = text_bert_T.T.to(text_seq.device) + + ssl_content = ssl(ref_audio).to(device) + + # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" + vits = VitsModel(vits_path).to(device) + vits.eval() + + # gpt_path = "GPT_weights_v2/xw-e15.ckpt" + # dict_s1 = torch.load(gpt_path, map_location=device) + dict_s1 = torch.load(gpt_path) + raw_t2s = get_raw_t2s_model(dict_s1).to(device) + print("#### get_raw_t2s_model ####") + print(raw_t2s.config) + t2s_m = T2SModel(raw_t2s) + t2s_m.eval() + t2s = torch.jit.script(t2s_m).to(device) + print("#### script t2s_m ####") + + print("vits.hps.data.sampling_rate:", vits.hps.data.sampling_rate) + gpt_sovits = GPT_SoVITS(t2s, vits).to(device) + gpt_sovits.eval() + + ref_audio_sr = s.resample(ref_audio, 16000, 32000).to(device) + + torch._dynamo.mark_dynamic(ssl_content, 2) + torch._dynamo.mark_dynamic(ref_audio_sr, 1) + torch._dynamo.mark_dynamic(ref_seq, 1) + torch._dynamo.mark_dynamic(text_seq, 1) + torch._dynamo.mark_dynamic(ref_bert, 0) + torch._dynamo.mark_dynamic(text_bert, 0) + + top_k = torch.LongTensor([5]).to(device) + + with torch.no_grad(): + gpt_sovits_export = torch.jit.trace( + gpt_sovits, example_inputs=(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, text_bert, top_k) + ) + + gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt") + gpt_sovits_export.save(gpt_sovits_path) + print("#### exported gpt_sovits ####") + + +@torch.jit.script +def 
parse_audio(ref_audio): + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() # .to(ref_audio.device) + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, 32000).float() # .to(ref_audio.device) + return ref_audio_16k, ref_audio_sr + + +@torch.jit.script +def resamplex(ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + return torchaudio.functional.resample(ref_audio, src_sr, dst_sr).float() + + +class GPT_SoVITS(nn.Module): + def __init__(self, t2s: T2SModel, vits: VitsModel): + super().__init__() + self.t2s = t2s + self.vits = vits + + def forward( + self, + ssl_content: torch.Tensor, + ref_audio_sr: torch.Tensor, + ref_seq: Tensor, + text_seq: Tensor, + ref_bert: Tensor, + text_bert: Tensor, + top_k: LongTensor, + speed=1.0, + ): + codes = self.vits.vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompts = prompt_semantic.unsqueeze(0) + + pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert, top_k) + audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed) + return audio + + +def test(): + parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output directory") + + args = parser.parse_args() + gpt_path = args.gpt_model + vits_path = args.sovits_model + ref_audio_path = args.ref_audio + ref_text = args.ref_text + + tokenizer = AutoTokenizer.from_pretrained(bert_path) + # bert_model = AutoModelForMaskedLM.from_pretrained(bert_path,output_hidden_states=True,torchscript=True) + # bert = MyBertModel(bert_model) + my_bert = torch.jit.load("onnx/bert_model.pt", map_location="cuda") + + # dict_s1 = torch.load(gpt_path, map_location="cuda") + # raw_t2s = get_raw_t2s_model(dict_s1) + # t2s = T2SModel(raw_t2s) + # t2s.eval() + # t2s = torch.jit.load("onnx/xw/t2s_model.pt",map_location='cuda') + + # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" + # vits = VitsModel(vits_path) + # vits.eval() + + # ssl = ExportSSLModel(SSLModel()).to('cuda') + # ssl.eval() + ssl = torch.jit.load("onnx/by/ssl_model.pt", map_location="cuda") + + # gpt_sovits = GPT_SoVITS(t2s,vits) + gpt_sovits = torch.jit.load("onnx/by/gpt_sovits_model.pt", map_location="cuda") + + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") + ref_seq = torch.LongTensor([ref_seq_id]) + ref_bert = ref_bert_T.T.to(ref_seq.device) + # text_seq_id,text_bert_T,norm_text = get_phones_and_bert("昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字.","all_zh",'v2') + text = "昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字." 
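+    # Re-derive BERT features for the same sentence with the exported bert_model.pt
+    # (word2ph approximated as in export_bert) and compare them with the features
+    # returned by get_phones_and_bert via torch.allclose below.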
+ + text_seq_id, text_bert_T, norm_text = get_phones_and_bert(text, "all_zh", "v2") + + test_bert = tokenizer(text, return_tensors="pt") + word2ph = [] + for c in text: + if c in [",", "。", ":", "?", "?", ",", "."]: + word2ph.append(1) + else: + word2ph.append(2) + test_bert["word2ph"] = torch.Tensor(word2ph).int() + + test_bert = my_bert( + test_bert["input_ids"].to("cuda"), + test_bert["attention_mask"].to("cuda"), + test_bert["token_type_ids"].to("cuda"), + test_bert["word2ph"].to("cuda"), + ) + + text_seq = torch.LongTensor([text_seq_id]) + text_bert = text_bert_T.T.to(text_seq.device) + + print("text_bert:", text_bert.shape, text_bert) + print("test_bert:", test_bert.shape, test_bert) + print(torch.allclose(text_bert.to("cuda"), test_bert)) + + print("text_seq:", text_seq.shape) + print("text_bert:", text_bert.shape, text_bert.type()) + + # [1,N] + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float().to("cuda") + print("ref_audio:", ref_audio.shape) + + ref_audio_sr = ssl.resample(ref_audio, 16000, 32000) + print("start ssl") + ssl_content = ssl(ref_audio) + + print("start gpt_sovits:") + print("ssl_content:", ssl_content.shape) + print("ref_audio_sr:", ref_audio_sr.shape) + print("ref_seq:", ref_seq.shape) + ref_seq = ref_seq.to("cuda") + print("text_seq:", text_seq.shape) + text_seq = text_seq.to("cuda") + print("ref_bert:", ref_bert.shape) + ref_bert = ref_bert.to("cuda") + print("text_bert:", text_bert.shape) + text_bert = text_bert.to("cuda") + + top_k = torch.LongTensor([5]).to("cuda") + + with torch.no_grad(): + audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert, top_k) + print("start write wav") + soundfile.write("out.wav", audio.detach().cpu().numpy(), 32000) + + +import text +import json + + +def export_symbel(version="v2"): + if version == "v1": + symbols = text._symbol_to_id_v1 + with open("onnx/symbols_v1.json", "w") as file: + json.dump(symbols, file, indent=4) + else: + symbols = text._symbol_to_id_v2 + with open("onnx/symbols_v2.json", "w") as file: + json.dump(symbols, file, indent=4) + + +def main(): + parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output directory") + parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model") + parser.add_argument("--device", help="Device to use") + + args = parser.parse_args() + export( + gpt_path=args.gpt_model, + vits_path=args.sovits_model, + ref_audio_path=args.ref_audio, + ref_text=args.ref_text, + output_path=args.output_path, + device=args.device, + export_bert_and_ssl=args.export_common_model, + ) + + +import inference_webui + +if __name__ == "__main__": + inference_webui.is_half = False + inference_webui.dtype = torch.float32 + main() + # test() diff --git a/GPT_SoVITS/export_torch_script_v3.py b/GPT_SoVITS/export_torch_script_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..b34495a7adc3486ea2eb482c43c41e20ddbc7c48 --- /dev/null +++ b/GPT_SoVITS/export_torch_script_v3.py @@ -0,0 +1,1035 @@ +import os +from export_torch_script import ( + T2SModel, + 
get_raw_t2s_model, + resamplex, + spectrogram_torch, +) +from f5_tts.model.backbones.dit import DiT +from inference_webui import get_phones_and_bert +import librosa +from module import commons +from module.mel_processing import mel_spectrogram_torch +from module.models_onnx import CFM, SynthesizerTrnV3 +import numpy as np +import torch._dynamo.config +import torchaudio +import logging +import uvicorn +import torch +import soundfile +from librosa.filters import mel as librosa_mel_fn + + +from inference_webui import get_spepc, norm_spec, resample, ssl_model + +logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG) +logger = logging.getLogger("uvicorn") + +is_half = True +device = "cuda" if torch.cuda.is_available() else "cpu" +now_dir = os.getcwd() + + +class MelSpectrgram(torch.nn.Module): + def __init__( + self, + dtype, + device, + n_fft, + num_mels, + sampling_rate, + hop_size, + win_size, + fmin, + fmax, + center=False, + ): + super().__init__() + self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device) + self.n_fft: int = n_fft + self.hop_size: int = hop_size + self.win_size: int = win_size + self.center: bool = center + + def forward(self, y): + y = torch.nn.functional.pad( + y.unsqueeze(1), + ( + int((self.n_fft - self.hop_size) / 2), + int((self.n_fft - self.hop_size) / 2), + ), + mode="reflect", + ) + y = y.squeeze(1) + spec = torch.stft( + y, + self.n_fft, + hop_length=self.hop_size, + win_length=self.win_size, + window=self.hann_window, + center=self.center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9) + spec = torch.matmul(self.mel_basis, spec) + # spec = spectral_normalize_torch(spec) + spec = torch.log(torch.clamp(spec, min=1e-5)) + return spec + + +class ExportDitBlocks(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + self.transformer_blocks = dit.transformer_blocks + self.norm_out = dit.norm_out + self.proj_out = dit.proj_out + self.depth = dit.depth + + def forward(self, x, t, mask, rope): + for block in self.transformer_blocks: + x = block(x, t, mask=mask, rope=(rope, 1.0)) + x = self.norm_out(x, t) + output = self.proj_out(x) + return output + + +class ExportDitEmbed(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + self.time_embed = dit.time_embed + self.d_embed = dit.d_embed + self.text_embed = dit.text_embed + self.input_embed = dit.input_embed + self.rotary_embed = dit.rotary_embed + self.rotary_embed.inv_freq.to(device) + + def forward( + self, + x0: torch.Tensor, # nosied input audio # noqa: F722 + cond0: torch.Tensor, # masked cond audio # noqa: F722 + x_lens: torch.Tensor, + time: torch.Tensor, # time step # noqa: F821 F722 + dt_base_bootstrap: torch.Tensor, + text0: torch.Tensor, # noqa: F722#####condition feature + ): + x = x0.transpose(2, 1) + cond = cond0.transpose(2, 1) + text = text0.transpose(2, 1) + mask = commons.sequence_mask(x_lens, max_length=x.size(1)).to(x.device) + + t = self.time_embed(time) + self.d_embed(dt_base_bootstrap) + text_embed = self.text_embed(text, x.shape[1]) + rope_t = torch.arange(x.shape[1], device=device) + rope, _ = self.rotary_embed(rope_t) + x = self.input_embed(x, cond, text_embed) + return x, t, mask, rope + + +class ExportDiT(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + if dit != 
None: + self.embed = ExportDitEmbed(dit) + self.blocks = ExportDitBlocks(dit) + else: + self.embed = None + self.blocks = None + + def forward( # x, prompt_x, x_lens, t, style,cond + self, # d is channel,n is T + x0: torch.Tensor, # nosied input audio # noqa: F722 + cond0: torch.Tensor, # masked cond audio # noqa: F722 + x_lens: torch.Tensor, + time: torch.Tensor, # time step # noqa: F821 F722 + dt_base_bootstrap: torch.Tensor, + text0: torch.Tensor, # noqa: F722#####condition feature + ): + x, t, mask, rope = self.embed(x0, cond0, x_lens, time, dt_base_bootstrap, text0) + output = self.blocks(x, t, mask, rope) + return output + + +class ExportCFM(torch.nn.Module): + def __init__(self, cfm: CFM): + super().__init__() + self.cfm = cfm + + def forward( + self, + fea_ref: torch.Tensor, + fea_todo_chunk: torch.Tensor, + mel2: torch.Tensor, + sample_steps: torch.LongTensor, + ): + T_min = fea_ref.size(2) + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + cfm_res = self.cfm(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps) + cfm_res = cfm_res[:, :, mel2.shape[2] :] + mel2 = cfm_res[:, :, -T_min:] + fea_ref = fea_todo_chunk[:, :, -T_min:] + return cfm_res, fea_ref, mel2 + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + +spec_min = -12 +spec_max = 2 + + +@torch.jit.script +def norm_spec(x): + spec_min = -12 + spec_max = 2 + return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + +def denorm_spec(x): + spec_min = -12 + spec_max = 2 + return (x + 1) / 2 * (spec_max - spec_min) + spec_min + + +class ExportGPTSovitsHalf(torch.nn.Module): + def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3): + super().__init__() + self.hps = hps + self.t2s_m = t2s_m + self.vq_model = vq_model + self.mel2 = MelSpectrgram( + dtype=torch.float32, + device=device, + n_fft=1024, + num_mels=100, + sampling_rate=24000, + hop_size=256, + win_size=1024, + fmin=0, + fmax=None, + center=False, + ) + # self.dtype = dtype + self.filter_length: int = hps.data.filter_length + self.sampling_rate: int = hps.data.sampling_rate + self.hop_length: int = hps.data.hop_length + self.win_length: int = hps.data.win_length + + def forward( + self, + ssl_content, + ref_audio_32k: torch.FloatTensor, + phoneme_ids0, + phoneme_ids1, + bert1, + bert2, + top_k, + ): + refer = spectrogram_torch( + ref_audio_32k, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ).to(ssl_content.dtype) + + codes = self.vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0) + # print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + pred_semantic = self.t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + # print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + ge = self.vq_model.create_ge(refer) + # print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + prompt_ = prompt.unsqueeze(0) + fea_ref = self.vq_model(prompt_, phoneme_ids0, ge) + # print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + # print(prompt_.shape, phoneme_ids0.shape, ge.shape) + # print(fea_ref.shape) + + ref_24k = resamplex(ref_audio_32k, 32000, 24000) + mel2 = norm_spec(self.mel2(ref_24k)).to(ssl_content.dtype) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + mel2 = mel2[:, :, :T_min] + 
fea_ref = fea_ref[:, :, :T_min] + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + + fea_todo = self.vq_model(pred_semantic, phoneme_ids1, ge) + # print('fea_todo',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + # print(pred_semantic.shape, phoneme_ids1.shape, ge.shape) + # print(fea_todo.shape) + + return fea_ref, fea_todo, mel2 + + +class GPTSoVITSV3(torch.nn.Module): + def __init__(self, gpt_sovits_half, cfm, bigvgan): + super().__init__() + self.gpt_sovits_half = gpt_sovits_half + self.cfm = cfm + self.bigvgan = bigvgan + + def forward( + self, + ssl_content, + ref_audio_32k: torch.FloatTensor, + phoneme_ids0: torch.LongTensor, + phoneme_ids1: torch.LongTensor, + bert1, + bert2, + top_k: torch.LongTensor, + sample_steps: torch.LongTensor, + ): + # current_time = datetime.now() + # print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_ref, fea_todo, mel2 = self.gpt_sovits_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) + chunk_len = 934 - fea_ref.shape[2] + wav_gen_list = [] + idx = 0 + wav_gen_length = fea_todo.shape[2] * 256 + while 1: + # current_time = datetime.now() + # print("idx:",idx,current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + # 因为导出的模型在不同shape时会重新编译还是怎么的,会卡顿10s这样, + # 所以在这里补0让他shape维持不变 + # 但是这样会导致生成的音频长度不对,所以在最后截取一下。 + # 经过 bigvgan 之后音频长度就是 fea_todo.shape[2] * 256 + complete_len = chunk_len - fea_todo_chunk.shape[-1] + if complete_len != 0: + fea_todo_chunk = torch.cat( + [ + fea_todo_chunk, + torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype), + ], + 2, + ) + + cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) + idx += chunk_len + + cfm_res = denorm_spec(cfm_res) + bigvgan_res = self.bigvgan(cfm_res) + wav_gen_list.append(bigvgan_res) + + wav_gen = torch.cat(wav_gen_list, 2) + return wav_gen[0][0][:wav_gen_length] + + +def init_bigvgan(): + global bigvgan_model + from BigVGAN import bigvgan + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + bigvgan_model.remove_weight_norm() + bigvgan_model = bigvgan_model.eval() + if is_half == True: + bigvgan_model = bigvgan_model.half().to(device) + else: + bigvgan_model = bigvgan_model.to(device) + + +class Sovits: + def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps): + self.vq_model = vq_model + self.hps = hps + cfm.estimator = ExportDiT(cfm.estimator) + self.cfm = cfm + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +from process_ckpt import 
get_sovits_version_from_path_fast, load_sovits_new + + +def get_sovits_weights(sovits_path): + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + is_exist_s2gv3 = os.path.exists(path_sovits_v3) + + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + if if_lora_v3 == True and is_exist_s2gv3 == False: + logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + + dict_s2 = load_sovits_new(sovits_path) + hps = dict_s2["config"] + hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + hps.model.version = "v1" + else: + hps.model.version = "v2" + + if model_version == "v3": + hps.model.version = "v3" + + logger.info(f"hps: {hps}") + + vq_model = SynthesizerTrnV3( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + # init_bigvgan() + model_version = hps.model.version + logger.info(f"模型版本: {model_version}") + + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.load_state_dict(dict_s2["weight"], strict=False) + vq_model.eval() + + cfm = vq_model.cfm + del vq_model.cfm + + sovits = Sovits(vq_model, cfm, hps) + return sovits + + +logger.info(f"torch version {torch.__version__}") +# ssl_model = cnhubert.get_model() +# if is_half: +# ssl_model = ssl_model.half().to(device) +# else: +# ssl_model = ssl_model.to(device) + + +def export_cfm( + e_cfm: ExportCFM, + mu: torch.Tensor, + x_lens: torch.LongTensor, + prompt: torch.Tensor, + n_timesteps: torch.IntTensor, + temperature=1.0, +): + cfm = e_cfm.cfm + + B, T = mu.size(0), mu.size(1) + x = torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature + print("x:", x.shape, x.dtype) + prompt_len = prompt.size(-1) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) + prompt_x[..., :prompt_len] = prompt[..., :prompt_len] + x[..., :prompt_len] = 0.0 + mu = mu.transpose(2, 1) + + ntimestep = int(n_timesteps) + + t = torch.tensor(0.0, dtype=x.dtype, device=x.device) + d = torch.tensor(1.0 / ntimestep, dtype=x.dtype, device=x.device) + + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d + + print( + "cfm input shapes:", + x.shape, + prompt_x.shape, + x_lens.shape, + t_tensor.shape, + d_tensor.shape, + mu.shape, + ) + + print("cfm input dtypes:", x.dtype, prompt_x.dtype, x_lens.dtype, t_tensor.dtype, d_tensor.dtype, mu.dtype) + + estimator: ExportDiT = torch.jit.trace( + cfm.estimator, + optimize=True, + example_inputs=(x, prompt_x, x_lens, t_tensor, d_tensor, mu), + ) + estimator.save("onnx/ad/estimator.pt") + # torch.onnx.export( + # cfm.estimator, + # (x, prompt_x, x_lens, t_tensor, d_tensor, mu), + # "onnx/ad/dit.onnx", + # input_names=["x", "prompt_x", "x_lens", "t", "d", "mu"], + # output_names=["output"], + # dynamic_axes={ + # "x": [2], + # "prompt_x": [2], + # "mu": [2], + # }, + # ) + print("save estimator ok") + cfm.estimator = estimator + export_cfm = torch.jit.script(e_cfm) + export_cfm.save("onnx/ad/cfm.pt") + # sovits.cfm = cfm + # cfm.save("onnx/ad/cfm.pt") + return export_cfm + + +def export(): + sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") + + init_bigvgan() + + dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") + raw_t2s 
= get_raw_t2s_model(dict_s1).to(device) + print("#### get_raw_t2s_model ####") + print(raw_t2s.config) + + if is_half: + raw_t2s = raw_t2s.half().to(device) + + t2s_m = T2SModel(raw_t2s) + t2s_m.eval() + script_t2s = torch.jit.script(t2s_m).to(device) + + hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = 32 + dtype = torch.float16 if is_half == True else torch.float32 + refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + codes = sovits.vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0).to(device) + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + "这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", + "auto", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + + # codes = sovits.vq_model.extract_latent(ssl_content) + # prompt_semantic = codes[0, 0] + # prompts = prompt_semantic.unsqueeze(0) + + top_k = torch.LongTensor([15]).to(device) + print("topk", top_k) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + print( + prompt.dtype, + phoneme_ids0.dtype, + phoneme_ids1.dtype, + bert1.dtype, + bert2.dtype, + top_k.dtype, + ) + print( + prompt.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + pred_semantic = t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + + ge = sovits.vq_model.create_ge(refer) + prompt_ = prompt.unsqueeze(0) + + torch._dynamo.mark_dynamic(prompt_, 2) + torch._dynamo.mark_dynamic(phoneme_ids0, 1) + + fea_ref = sovits.vq_model(prompt_, phoneme_ids0, ge) + + inputs = { + "forward": (prompt_, phoneme_ids0, ge), + "extract_latent": ssl_content, + "create_ge": refer, + } + + trace_vq_model = torch.jit.trace_module(sovits.vq_model, inputs, optimize=True) + trace_vq_model.save("onnx/ad/vq_model.pt") + + print(fea_ref.shape, fea_ref.dtype, ge.shape) + print(prompt_.shape, phoneme_ids0.shape, ge.shape) + + # vq_model = torch.jit.trace( + # sovits.vq_model, + # optimize=True, + # # strict=False, + # example_inputs=(prompt_, phoneme_ids0, ge), + # ) + # vq_model = sovits.vq_model + vq_model = trace_vq_model + + gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model) + torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt") + + ref_audio, sr = torchaudio.load(ref_wav_path) + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: + ref_audio = ref_audio.mean(0).unsqueeze(0) + if sr != 24000: + ref_audio = 
resample(ref_audio, sr) + # mel2 = mel_fn(ref_audio) + mel2 = norm_spec(mel_fn(ref_audio)) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + fea_ref = fea_ref[:, :, :T_min] + print("fea_ref:", fea_ref.shape, T_min) + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + chunk_len = 934 - T_min + mel2 = mel2.to(dtype) + + # fea_todo, ge = sovits.vq_model(pred_semantic,y_lengths, phoneme_ids1, ge) + fea_todo = vq_model(pred_semantic, phoneme_ids1, ge) + + cfm_resss = [] + idx = 0 + sample_steps = torch.LongTensor([sample_steps]).to(device) + export_cfm_ = ExportCFM(sovits.cfm) + while 1: + print("idx:", idx) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + print( + "export_cfm:", + fea_ref.shape, + fea_todo_chunk.shape, + mel2.shape, + sample_steps.shape, + ) + if idx == 0: + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + export_cfm_ = export_cfm( + export_cfm_, + fea, + torch.LongTensor([fea.size(1)]).to(fea.device), + mel2, + sample_steps, + ) + # torch.onnx.export( + # export_cfm_, + # ( + # fea_ref, + # fea_todo_chunk, + # mel2, + # sample_steps, + # ), + # "onnx/ad/cfm.onnx", + # input_names=["fea_ref", "fea_todo_chunk", "mel2", "sample_steps"], + # output_names=["cfm_res", "fea_ref_", "mel2_"], + # dynamic_axes={ + # "fea_ref": [2], + # "fea_todo_chunk": [2], + # "mel2": [2], + # }, + # ) + + idx += chunk_len + + cfm_res, fea_ref, mel2 = export_cfm_(fea_ref, fea_todo_chunk, mel2, sample_steps) + cfm_resss.append(cfm_res) + continue + + cmf_res = torch.cat(cfm_resss, 2) + cmf_res = denorm_spec(cmf_res).to(device) + print("cmf_res:", cmf_res.shape, cmf_res.dtype) + with torch.inference_mode(): + cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype) + torch._dynamo.mark_dynamic(cmf_res_rand, 2) + bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,)) + bigvgan_model_.save("onnx/ad/bigvgan_model.pt") + wav_gen = bigvgan_model(cmf_res) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + audio = wav_gen[0][0].cpu().detach().numpy() + + sr = 24000 + soundfile.write("out.export.wav", (audio * 32768).astype(np.int16), sr) + + +from datetime import datetime + + +def test_export( + todo_text, + gpt_sovits_v3_half, + cfm, + bigvgan, + output, +): + # hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = 8 + + dtype = torch.float16 if is_half == True else torch.float32 + + zero_wav = np.zeros( + int(16000 * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + todo_text, + "zh", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = 
torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + top_k = torch.LongTensor([15]).to(device) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start inference %s", current_time) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + fea_ref, fea_todo, mel2 = gpt_sovits_v3_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) + chunk_len = 934 - fea_ref.shape[2] + print(fea_ref.shape, fea_todo.shape, mel2.shape) + + cfm_resss = [] + sample_steps = torch.LongTensor([sample_steps]) + idx = 0 + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start cfm %s", current_time) + wav_gen_length = fea_todo.shape[2] * 256 + + while 1: + current_time = datetime.now() + print("idx:", idx, current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + complete_len = chunk_len - fea_todo_chunk.shape[-1] + if complete_len != 0: + fea_todo_chunk = torch.cat([fea_todo_chunk, torch.zeros(1, 512, complete_len).to(device).to(dtype)], 2) + + cfm_res, fea_ref, mel2 = cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) + # if complete_len > 0 : + # cfm_res = cfm_res[:, :, :-complete_len] + # fea_ref = fea_ref[:, :, :-complete_len] + # mel2 = mel2[:, :, :-complete_len] + + idx += chunk_len + + current_time = datetime.now() + print("cfm end", current_time.strftime("%Y-%m-%d %H:%M:%S")) + cfm_res = denorm_spec(cfm_res).to(device) + bigvgan_res = bigvgan(cfm_res) + cfm_resss.append(bigvgan_res) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start bigvgan %s", current_time) + wav_gen = torch.cat(cfm_resss, 2) + # cmf_res = denorm_spec(cmf_res) + # cmf_res = cmf_res.to(device) + # print("cmf_res:", cmf_res.shape) + + # cmf_res = torch.cat([cmf_res,torch.zeros([1,100,2000-cmf_res.size(2)],device=device,dtype=cmf_res.dtype)], 2) + + # wav_gen = bigvgan(cmf_res) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + wav_gen = wav_gen[:, :, :wav_gen_length] + + audio = wav_gen[0][0].cpu().detach().numpy() + logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + sr = 24000 + soundfile.write(output, (audio * 32768).astype(np.int16), sr) + + +def test_export1( + todo_text, + gpt_sovits_v3, + output, +): + # hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = torch.LongTensor([16]) + + dtype = torch.float16 if is_half == True else torch.float32 + + zero_wav = np.zeros( + int(24000 * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + print("ssl_content:", ssl_content.shape, ssl_content.dtype) + + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", 
"all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + todo_text, + "zh", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + top_k = torch.LongTensor([15]).to(device) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start inference %s", current_time) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + + wav_gen = torch.cat([wav_gen, zero_wav_torch], 0) + + audio = wav_gen.cpu().detach().numpy() + logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + sr = 24000 + soundfile.write(output, (audio * 32768).astype(np.int16), sr) + + +import time + + +def test_(): + sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") + + # cfm = ExportCFM(sovits.cfm) + # cfm.cfm.estimator = dit + sovits.cfm = None + + cfm = torch.jit.load("onnx/ad/cfm.pt", map_location=device) + # cfm = torch.jit.optimize_for_inference(cfm) + cfm = cfm.half().to(device) + + cfm.eval() + + logger.info("cfm ok") + + dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") + # v2 的 gpt 也可以用 + # dict_s1 = torch.load("GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt") + raw_t2s = get_raw_t2s_model(dict_s1).to(device) + print("#### get_raw_t2s_model ####") + print(raw_t2s.config) + if is_half: + raw_t2s = raw_t2s.half().to(device) + t2s_m = T2SModel(raw_t2s).half().to(device) + t2s_m.eval() + t2s_m = torch.jit.script(t2s_m) + t2s_m.eval() + # t2s_m.top_k = 15 + logger.info("t2s_m ok") + + vq_model: torch.jit.ScriptModule = torch.jit.load("onnx/ad/vq_model.pt", map_location=device) + # vq_model = torch.jit.optimize_for_inference(vq_model) + # vq_model = vq_model.half().to(device) + vq_model.eval() + # vq_model = sovits.vq_model + logger.info("vq_model ok") + + # gpt_sovits_v3_half = torch.jit.load("onnx/ad/gpt_sovits_v3_half.pt") + # gpt_sovits_v3_half = torch.jit.optimize_for_inference(gpt_sovits_v3_half) + # gpt_sovits_v3_half = gpt_sovits_v3_half.half() + # gpt_sovits_v3_half = gpt_sovits_v3_half.cuda() + # gpt_sovits_v3_half.eval() + gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model) + logger.info("gpt_sovits_v3_half ok") + + # init_bigvgan() + # global bigvgan_model + bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt") + # bigvgan_model = torch.jit.optimize_for_inference(bigvgan_model) + bigvgan_model = bigvgan_model.half() + bigvgan_model = bigvgan_model.cuda() + bigvgan_model.eval() + + logger.info("bigvgan ok") + + gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model) + gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3) + gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt") + gpt_sovits_v3 = gpt_sovits_v3.half().to(device) + gpt_sovits_v3.eval() + print("save gpt_sovits_v3 ok") + + time.sleep(5) + # print("thread:", torch.get_num_threads()) + # print("thread:", torch.get_num_interop_threads()) + # torch.set_num_interop_threads(1) + # torch.set_num_threads(1) + + test_export1( + "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. 
He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", + gpt_sovits_v3, + "out.wav", + ) + + test_export1( + "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!", + gpt_sovits_v3, + "out2.wav", + ) + + # test_export( + # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP. 哈哈哈...", + # gpt_sovits_v3_half, + # cfm, + # bigvgan_model, + # "out2.wav", + # ) + + +def test_export_gpt_sovits_v3(): + gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt", map_location=device) + # test_export1( + # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", + # gpt_sovits_v3, + # "out3.wav", + # ) + # test_export1( + # "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!", + # gpt_sovits_v3, + # "out4.wav", + # ) + test_export1( + "风萧萧兮易水寒,壮士一去兮不复还.", + gpt_sovits_v3, + "out5.wav", + ) + + +with torch.no_grad(): + # export() + test_() + # test_export_gpt_sovits_v3() diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..459a3d3632f599768465c16f6d889f47af5fe271 --- /dev/null +++ b/GPT_SoVITS/inference_cli.py @@ -0,0 +1,86 @@ +import argparse +import os +import soundfile as sf + +from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav + +i18n = I18nAuto() + + +def synthesize( + GPT_model_path, + SoVITS_model_path, + ref_audio_path, + ref_text_path, + ref_language, + target_text_path, + target_language, + output_path, +): + # Read reference text + with open(ref_text_path, "r", encoding="utf-8") as file: + ref_text = file.read() + + # Read target text + with open(target_text_path, "r", encoding="utf-8") as file: + target_text = file.read() + + # Change model weights + change_gpt_weights(gpt_path=GPT_model_path) + change_sovits_weights(sovits_path=SoVITS_model_path) + + # Synthesize audio + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=i18n(ref_language), + text=target_text, + text_language=i18n(target_language), + top_p=1, + temperature=1, + ) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + print(f"Audio saved to {output_wav_path}") + + +def main(): + parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument( + "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" + ) + parser.add_argument("--target_text", required=True, help="Path to the target text file") + parser.add_argument( + "--target_language", + 
required=True, + choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], + help="Language of the target text", + ) + parser.add_argument("--output_path", required=True, help="Path to the output directory") + + args = parser.parse_args() + + synthesize( + args.gpt_model, + args.sovits_model, + args.ref_audio, + args.ref_text, + args.ref_language, + args.target_text, + args.target_language, + args.output_path, + ) + + +if __name__ == "__main__": + main() diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..379f7fa8cdb32b4b56db8b242717c23bdb51eca0 --- /dev/null +++ b/GPT_SoVITS/inference_gui.py @@ -0,0 +1,316 @@ +import os +import sys +from PyQt5.QtCore import QEvent +from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit +from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox +import soundfile as sf + +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto() + +from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav + + +class GPTSoVITSGUI(QMainWindow): + GPT_Path = gpt_path + SoVITS_Path = sovits_path + + def __init__(self): + super().__init__() + + self.setWindowTitle("GPT-SoVITS GUI") + self.setGeometry(800, 450, 950, 850) + + self.setStyleSheet(""" + QWidget { + background-color: #a3d3b1; + } + + QTabWidget::pane { + background-color: #a3d3b1; + } + + QTabWidget::tab-bar { + alignment: left; + } + + QTabBar::tab { + background: #8da4bf; + color: #ffffff; + padding: 8px; + } + + QTabBar::tab:selected { + background: #2a3f54; + } + + QLabel { + color: #000000; + } + + QPushButton { + background-color: #4CAF50; + color: white; + padding: 8px; + border: 1px solid #4CAF50; + border-radius: 4px; + } + + QPushButton:hover { + background-color: #45a049; + border: 1px solid #45a049; + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); + } + """) + + license_text = ( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." 
+ ) + license_label = QLabel(license_text) + license_label.setWordWrap(True) + + self.GPT_model_label = QLabel("选择GPT模型:") + self.GPT_model_input = QLineEdit() + self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setText(self.GPT_Path) + self.GPT_model_input.setReadOnly(True) + self.GPT_model_button = QPushButton("选择GPT模型文件") + self.GPT_model_button.clicked.connect(self.select_GPT_model) + + self.SoVITS_model_label = QLabel("选择SoVITS模型:") + self.SoVITS_model_input = QLineEdit() + self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setText(self.SoVITS_Path) + self.SoVITS_model_input.setReadOnly(True) + self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") + self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) + + self.ref_audio_label = QLabel("上传参考音频:") + self.ref_audio_input = QLineEdit() + self.ref_audio_input.setPlaceholderText("拖拽或选择文件") + self.ref_audio_input.setReadOnly(True) + self.ref_audio_button = QPushButton("选择音频文件") + self.ref_audio_button.clicked.connect(self.select_ref_audio) + + self.ref_text_label = QLabel("参考音频文本:") + self.ref_text_input = QLineEdit() + self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") + self.ref_text_button = QPushButton("上传文本") + self.ref_text_button.clicked.connect(self.upload_ref_text) + + self.ref_language_label = QLabel("参考音频语言:") + self.ref_language_combobox = QComboBox() + self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.ref_language_combobox.setCurrentText("多语种混合") + + self.target_text_label = QLabel("合成目标文本:") + self.target_text_input = QLineEdit() + self.target_text_input.setPlaceholderText("直接输入文字或上传文本") + self.target_text_button = QPushButton("上传文本") + self.target_text_button.clicked.connect(self.upload_target_text) + + self.target_language_label = QLabel("合成音频语言:") + self.target_language_combobox = QComboBox() + self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.target_language_combobox.setCurrentText("多语种混合") + + self.output_label = QLabel("输出音频路径:") + self.output_input = QLineEdit() + self.output_input.setPlaceholderText("拖拽或选择文件") + self.output_input.setReadOnly(True) + self.output_button = QPushButton("选择文件夹") + self.output_button.clicked.connect(self.select_output_path) + + self.output_text = QTextEdit() + self.output_text.setReadOnly(True) + + self.add_drag_drop_events( + [ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ] + ) + + self.synthesize_button = QPushButton("合成") + self.synthesize_button.clicked.connect(self.synthesize) + + self.clear_output_button = QPushButton("清空输出") + self.clear_output_button.clicked.connect(self.clear_output) + + self.status_bar = QStatusBar() + + main_layout = QVBoxLayout() + + input_layout = QGridLayout(self) + input_layout.setSpacing(10) + + input_layout.addWidget(license_label, 0, 0, 1, 3) + + input_layout.addWidget(self.GPT_model_label, 1, 0) + input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) + input_layout.addWidget(self.GPT_model_button, 2, 2) + + input_layout.addWidget(self.SoVITS_model_label, 3, 0) + input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) + input_layout.addWidget(self.SoVITS_model_button, 4, 2) + + input_layout.addWidget(self.ref_audio_label, 5, 0) + input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) + input_layout.addWidget(self.ref_audio_button, 6, 2) + + input_layout.addWidget(self.ref_language_label, 7, 0) + 
input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_text_label, 9, 0) + input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) + input_layout.addWidget(self.ref_text_button, 10, 2) + + input_layout.addWidget(self.target_language_label, 11, 0) + input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) + input_layout.addWidget(self.target_text_label, 13, 0) + input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) + input_layout.addWidget(self.target_text_button, 14, 2) + + input_layout.addWidget(self.output_label, 15, 0) + input_layout.addWidget(self.output_input, 16, 0, 1, 2) + input_layout.addWidget(self.output_button, 16, 2) + + main_layout.addLayout(input_layout) + + output_layout = QVBoxLayout() + output_layout.addWidget(self.output_text) + main_layout.addLayout(output_layout) + + main_layout.addWidget(self.synthesize_button) + + main_layout.addWidget(self.clear_output_button) + + main_layout.addWidget(self.status_bar) + + self.central_widget = QWidget() + self.central_widget.setLayout(main_layout) + self.setCentralWidget(self.central_widget) + + def dragEnterEvent(self, event): + if event.mimeData().hasUrls(): + event.acceptProposedAction() + + def dropEvent(self, event): + if event.mimeData().hasUrls(): + file_paths = [url.toLocalFile() for url in event.mimeData().urls()] + if len(file_paths) == 1: + self.update_ref_audio(file_paths[0]) + else: + self.update_ref_audio(", ".join(file_paths)) + + def add_drag_drop_events(self, widgets): + for widget in widgets: + widget.setAcceptDrops(True) + widget.installEventFilter(self) + + def eventFilter(self, obj, event): + if event.type() in (QEvent.DragEnter, QEvent.Drop): + mime_data = event.mimeData() + if mime_data.hasUrls(): + event.acceptProposedAction() + + return super().eventFilter(obj, event) + + def select_GPT_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") + if file_path: + self.GPT_model_input.setText(file_path) + + def select_SoVITS_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") + if file_path: + self.SoVITS_model_input.setText(file_path) + + def select_ref_audio(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") + if file_path: + self.update_ref_audio(file_path) + + def upload_ref_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + self.ref_text_input.setText(content) + + def upload_target_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + self.target_text_input.setText(content) + + def select_output_path(self): + options = QFileDialog.Options() + options |= QFileDialog.DontUseNativeDialog + options |= QFileDialog.ShowDirsOnly + + folder_dialog = QFileDialog() + folder_dialog.setOptions(options) + folder_dialog.setFileMode(QFileDialog.Directory) + + if folder_dialog.exec_(): + folder_path = folder_dialog.selectedFiles()[0] + self.output_input.setText(folder_path) + + def update_ref_audio(self, file_path): + self.ref_audio_input.setText(file_path) + + def clear_output(self): + self.output_text.clear() + + def synthesize(self): + GPT_model_path = self.GPT_model_input.text() + SoVITS_model_path = 
self.SoVITS_model_input.text() + ref_audio_path = self.ref_audio_input.text() + language_combobox = self.ref_language_combobox.currentText() + language_combobox = i18n(language_combobox) + ref_text = self.ref_text_input.text() + target_language_combobox = self.target_language_combobox.currentText() + target_language_combobox = i18n(target_language_combobox) + target_text = self.target_text_input.text() + output_path = self.output_input.text() + + if GPT_model_path != self.GPT_Path: + change_gpt_weights(gpt_path=GPT_model_path) + self.GPT_Path = GPT_model_path + if SoVITS_model_path != self.SoVITS_Path: + change_sovits_weights(sovits_path=SoVITS_model_path) + self.SoVITS_Path = SoVITS_model_path + + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=target_language_combobox, + ) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + + result = "Audio saved to " + output_wav_path + + self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.output_text.append("处理结果:\n" + result) + + +if __name__ == "__main__": + app = QApplication(sys.argv) + mainWin = GPTSoVITSGUI() + mainWin.show() + sys.exit(app.exec_()) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py new file mode 100644 index 0000000000000000000000000000000000000000..4bee27cdc617b2a3aeb77f07e0c497fcd1031974 --- /dev/null +++ b/GPT_SoVITS/inference_webui.py @@ -0,0 +1,1281 @@ +""" +按中英混合识别 +按日英混合识别 +多语种启动切分识别语种 +全部按中文识别 +全部按英文识别 +全部按日文识别 +""" + +import logging +import traceback +import warnings + +import torchaudio + +logging.getLogger("markdown_it").setLevel(logging.ERROR) +logging.getLogger("urllib3").setLevel(logging.ERROR) +logging.getLogger("httpcore").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("asyncio").setLevel(logging.ERROR) +logging.getLogger("charset_normalizer").setLevel(logging.ERROR) +logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) +logging.getLogger("multipart.multipart").setLevel(logging.ERROR) +warnings.simplefilter(action="ignore", category=FutureWarning) + +import json +import os +import re +import sys + +import torch +from text.LangSegmenter import LangSegmenter + +try: + import gradio.analytics as analytics + + analytics.version_check = lambda: None +except: + ... 
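+# Checkpoint-path resolution for the block below: an explicit gpt_path / sovits_path environment
+# variable takes precedence, then the per-version entry recorded in ./weight.json, and finally the
+# first pretrained checkpoint from the lists defined below that actually exists on disk.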
+version = model_version = os.environ.get("version", "v2") +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +is_exist_s2gv4 = os.path.exists(path_sovits_v4) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ + + +if os.path.exists("./weight.json"): + pass +else: + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) + +with open("./weight.json", "r", encoding="utf-8") as file: + weight_data = file.read() + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): + gpt_path = gpt_path[0] + if isinstance(sovits_path, list): + sovits_path = sovits_path[0] + +# gpt_path = os.environ.get( +# "gpt_path", pretrained_gpt_name +# ) +# sovits_path = os.environ.get("sovits_path", pretrained_sovits_name) +cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base") +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") +infer_ttswebui = os.environ.get("infer_ttswebui", 9872) +infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share = eval(is_share) +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +# is_half=False +punctuation = set(["!", "?", "…", ",", ".", "-", " "]) +import gradio as gr +import librosa +import numpy as np +from feature_extractor import cnhubert +from transformers import AutoModelForMaskedLM, AutoTokenizer + +cnhubert.cnhubert_base_path = cnhubert_base_path + +import random + +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3,Generator + + +def set_seed(seed): + if seed == -1: + seed = random.randint(0, 1000000) + seed = int(seed) + random.seed(seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + +# set_seed(42) + +from time import time as ttime + +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from peft import LoraConfig, get_peft_model +from text import cleaned_text_to_sequence +from text.cleaner import clean_text + +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +i18n = 
I18nAuto(language=language) + +# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 + +if torch.cuda.is_available(): + device = "cuda" +else: + device = "cpu" + +dict_language_v1 = { + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 +} +dict_language_v2 = { + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): "all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 +} +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 + +tokenizer = AutoTokenizer.from_pretrained(bert_path) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +if is_half == True: + bert_model = bert_model.half().to(device) +else: + bert_model = bert_model.to(device) + + +def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + return phone_level_feature.T + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +ssl_model = cnhubert.get_model() +if is_half == True: + ssl_model = ssl_model.half().to(device) +else: + ssl_model = ssl_model.to(device) + +resample_transform_dict = {} + + +def resample(audio_tensor, sr0,sr1): + global resample_transform_dict + key="%s-%s"%(sr0,sr1) + if key not in resample_transform_dict: + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) + return resample_transform_dict[key](audio_tensor) + + +###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt +# symbol_version-model_version-if_lora_v3 +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + +v3v4set={"v3","v4"} +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global vq_model, hps, version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + print(sovits_path,version, model_version, if_lora_v3) + is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4 + if 
if_lora_v3 == True and is_exist == False: + info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version) + gr.Warning(info) + raise FileExistsError(info) + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 + if prompt_language is not None and text_language is not None: + if prompt_language in list(dict_language.keys()): + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) + else: + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} + if text_language in list(dict_language.keys()): + text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} + else: + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version in v3v4set: + visible_sample_steps = True + visible_inp_refs = False + else: + visible_sample_steps = False + visible_inp_refs = True + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "visible": visible_sample_steps, "value": 32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "visible": True if model_version =="v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) + + dict_s2 = load_sovits_new(sovits_path) + hps = dict_s2["config"] + hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + hps.model.version = "v1" + else: + hps.model.version = "v2" + version = hps.model.version + # print("sovits版本:",hps.model.version) + if model_version not in v3v4set: + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + model_version = version + else: + hps.model.version=model_version + vq_model = SynthesizerTrnV3( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + if "pretrained" not in sovits_path: + try: + del vq_model.enc_q + except: + pass + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + if if_lora_v3 == False: + print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False)) + else: + path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 + print( + "loading sovits_%spretrained_G"%model_version, + vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False), + ) + lora_rank = dict_s2["lora_rank"] + lora_config = LoraConfig( + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + r=lora_rank, + lora_alpha=lora_rank, + init_lora_weights=True, + ) + vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) + print("loading sovits_%s_lora%s" 
% (model_version,lora_rank)) + vq_model.load_state_dict(dict_s2["weight"], strict=False) + vq_model.cfm = vq_model.cfm.merge_and_unload() + # torch.save(vq_model.state_dict(),"merge_win.pth") + vq_model.eval() + + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "visible": visible_sample_steps, "value":32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "visible": True if model_version =="v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) + + +try: + next(change_sovits_weights(sovits_path)) +except: + pass + + +def change_gpt_weights(gpt_path): + global hz, max_sec, t2s_model, config + hz = 50 + dict_s1 = torch.load(gpt_path, map_location="cpu") + config = dict_s1["config"] + max_sec = config["data"]["max_sec"] + t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) + t2s_model.load_state_dict(dict_s1["weight"]) + if is_half == True: + t2s_model = t2s_model.half() + t2s_model = t2s_model.to(device) + t2s_model.eval() + # total = sum([param.nelement() for param in t2s_model.parameters()]) + # print("Number of parameter: %.2fM" % (total / 1e6)) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["GPT"][version] = gpt_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) + + +change_gpt_weights(gpt_path) +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +import torch + +now_dir = os.getcwd() + + +def init_bigvgan(): + global bigvgan_model,hifigan_model + from BigVGAN import bigvgan + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + bigvgan_model.remove_weight_norm() + bigvgan_model = bigvgan_model.eval() + if hifigan_model: + hifigan_model=hifigan_model.cpu() + hifigan_model=None + try:torch.cuda.empty_cache() + except:pass + if is_half == True: + bigvgan_model = bigvgan_model.half().to(device) + else: + bigvgan_model = bigvgan_model.to(device) + +def init_hifigan(): + global hifigan_model,bigvgan_model + hifigan_model = Generator( + initial_channel=100, + resblock="1", + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + upsample_rates=[10, 6, 2, 2, 2], + upsample_initial_channel=512, + upsample_kernel_sizes=[20, 12, 4, 4, 4], + gin_channels=0, is_bias=True + ) + hifigan_model.eval() + hifigan_model.remove_weight_norm() + state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu") + print("loading vocoder",hifigan_model.load_state_dict(state_dict_g)) + if bigvgan_model: + bigvgan_model=bigvgan_model.cpu() + bigvgan_model=None + try:torch.cuda.empty_cache() + except:pass + if is_half == True: + hifigan_model 
= hifigan_model.half().to(device) + else: + hifigan_model = hifigan_model.to(device) + +bigvgan_model=hifigan_model=None +if model_version=="v3": + init_bigvgan() +if model_version=="v4": + init_hifigan() + + +def get_spepc(hps, filename): + # audio = load_audio(filename, int(hps.data.sampling_rate)) + audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) + return spec + + +def clean_text_inf(text, language, version): + language = language.replace("all_", "") + phones, word2ph, norm_text = clean_text(text, language, version) + phones = cleaned_text_to_sequence(phones, version) + return phones, word2ph, norm_text + + +dtype = torch.float16 if is_half == True else torch.float32 + + +def get_bert_inf(phones, word2ph, norm_text, language): + language = language.replace("all_", "") + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + + return bert + + +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + + +def get_first(text): + pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" + text = re.split(pattern, text)[0].strip() + return text + + +from text import chinese + + +def get_phones_and_bert(text, language, version, final=False): + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "zh", version) + else: + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = get_bert_feature(norm_text, word2ph).to(device) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) + else: + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist = [] + langlist = [] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + print(textlist) + print(langlist) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = 
clean_text_inf(textlist[i], lang, version) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = "".join(norm_text_list) + + if not final and len(phones) < 6: + return get_phones_and_bert("." + text, language, version, final=True) + + return phones, bert.to(dtype), norm_text + + +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch + +spec_min = -12 +spec_max = 2 + + +def norm_spec(x): + return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + +def denorm_spec(x): + return (x + 1) / 2 * (spec_max - spec_min) + spec_min + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) +mel_fn_v4 = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1280, + "win_size": 1280, + "hop_size": 320, + "num_mels": 100, + "sampling_rate": 32000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +def merge_short_text_in_array(texts, threshold): + if (len(texts)) < 2: + return texts + result = [] + text = "" + for ele in texts: + text += ele + if len(text) >= threshold: + result.append(text) + text = "" + if len(text) > 0: + if len(result) == 0: + result.append(text) + else: + result[len(result) - 1] += text + return result + + +sr_model = None + + +def audio_sr(audio, sr): + global sr_model + if sr_model == None: + from tools.audio_sr import AP_BWE + + try: + sr_model = AP_BWE(device, DictToAttrRecursive) + except FileNotFoundError: + gr.Warning(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) + + +##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature +# cache_tokens={}#暂未实现清理机制 +cache = {} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, +): + global cache + if ref_wav_path: + pass + else: + gr.Warning(i18n("请上传参考音频")) + if text: + pass + else: + gr.Warning(i18n("请填入推理文本")) + t = [] + if prompt_text is None or len(prompt_text) == 0: + ref_free = True + if model_version in v3v4set: + ref_free = False # s2v3暂不支持ref_free + else: + if_sr = False + t0 = ttime() + prompt_language = dict_language[prompt_language] + text_language = dict_language[text_language] + + if not ref_free: + prompt_text = prompt_text.strip("\n") + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." + print(i18n("实际输入的参考文本:"), prompt_text) + text = text.strip("\n") + # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." 
+ text + + print(i18n("实际输入的目标文本:"), text) + zero_wav = np.zeros( + int(hps.data.sampling_rate * pause_second), + dtype=np.float16 if is_half == True else np.float32, + ) + zero_wav_torch = torch.from_numpy(zero_wav) + if is_half == True: + zero_wav_torch = zero_wav_torch.half().to(device) + else: + zero_wav_torch = zero_wav_torch.to(device) + if not ref_free: + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000: + gr.Warning(i18n("参考音频在3~10秒范围外,请更换!")) + raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) + wav16k = torch.from_numpy(wav16k) + if is_half == True: + wav16k = wav16k.half().to(device) + else: + wav16k = wav16k.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + codes = vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0).to(device) + + t1 = ttime() + t.append(t1 - t0) + + if how_to_cut == i18n("凑四句一切"): + text = cut1(text) + elif how_to_cut == i18n("凑50字一切"): + text = cut2(text) + elif how_to_cut == i18n("按中文句号。切"): + text = cut3(text) + elif how_to_cut == i18n("按英文句号.切"): + text = cut4(text) + elif how_to_cut == i18n("按标点符号切"): + text = cut5(text) + while "\n\n" in text: + text = text.replace("\n\n", "\n") + print(i18n("实际输入的目标文本(切句后):"), text) + texts = text.split("\n") + texts = process_text(texts) + texts = merge_short_text_in_array(texts, 5) + audio_opt = [] + ###s2v3暂不支持ref_free + if not ref_free: + phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version) + + for i_text, text in enumerate(texts): + # 解决输入目标文本的空行导致报错的问题 + if len(text.strip()) == 0: + continue + if text[-1] not in splits: + text += "。" if text_language != "en" else "." 
+ print(i18n("实际输入的目标文本(每句):"), text) + phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) + print(i18n("前端处理后的文本(每句):"), norm_text2) + if not ref_free: + bert = torch.cat([bert1, bert2], 1) + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) + else: + bert = bert2 + all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + + t2 = ttime() + # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature) + # print(cache.keys(),if_freeze) + if i_text in cache and if_freeze == True: + pred_semantic = cache[i_text] + else: + with torch.no_grad(): + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + None if ref_free else prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) + cache[i_text] = pred_semantic + t3 = ttime() + ###v3不存在以下逻辑和inp_refs + if model_version not in v3v4set: + refers = [] + if inp_refs: + for path in inp_refs: + try: + refer = get_spepc(hps, path.name).to(dtype).to(device) + refers.append(refer) + except: + traceback.print_exc() + if len(refers) == 0: + refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] + audio = vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed + )[0][0] # .cpu().detach().numpy() + else: + refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + # print(11111111, phoneme_ids0, phoneme_ids1) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + ref_audio, sr = torchaudio.load(ref_wav_path) + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: + ref_audio = ref_audio.mean(0).unsqueeze(0) + tgt_sr=24000 if model_version=="v3"else 32000 + if sr != tgt_sr: + ref_audio = resample(ref_audio, sr,tgt_sr) + # print("ref_audio",ref_audio.abs().mean()) + mel2 = mel_fn(ref_audio)if model_version=="v3"else mel_fn_v4(ref_audio) + mel2 = norm_spec(mel2) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + mel2 = mel2[:, :, :T_min] + fea_ref = fea_ref[:, :, :T_min] + Tref=468 if model_version=="v3"else 500 + Tchunk=934 if model_version=="v3"else 1000 + if T_min > Tref: + mel2 = mel2[:, :, -Tref:] + fea_ref = fea_ref[:, :, -Tref:] + T_min = Tref + chunk_len = Tchunk - T_min + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) + cfm_resss = [] + idx = 0 + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + idx += chunk_len + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] + mel2 = cfm_res[:, :, -T_min:] + fea_ref = fea_todo_chunk[:, :, -T_min:] + cfm_resss.append(cfm_res) + cfm_res = torch.cat(cfm_resss, 2) + cfm_res = denorm_spec(cfm_res) + if model_version=="v3": + if bigvgan_model == None: + init_bigvgan() + else:#v4 + if hifigan_model == None: + init_hifigan() + vocoder_model=bigvgan_model if model_version=="v3"else 
hifigan_model + with torch.inference_mode(): + wav_gen = vocoder_model(cfm_res) + audio = wav_gen[0][0] # .cpu().detach().numpy() + max_audio = torch.abs(audio).max() # 简单防止16bit爆音 + if max_audio > 1: + audio = audio / max_audio + audio_opt.append(audio) + audio_opt.append(zero_wav_torch) # zero_wav + t4 = ttime() + t.extend([t2 - t1, t3 - t2, t4 - t3]) + t1 = ttime() + print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))) + audio_opt = torch.cat(audio_opt, 0) # np.concatenate + if model_version in {"v1","v2"}:opt_sr=32000 + elif model_version=="v3":opt_sr=24000 + else:opt_sr=48000#v4 + if if_sr == True and opt_sr == 24000: + print(i18n("音频超分中")) + audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio + else: + audio_opt = audio_opt.cpu().detach().numpy() + yield opt_sr, (audio_opt * 32767).astype(np.int16) + + +def split(todo_text): + todo_text = todo_text.replace("……", "。").replace("——", ",") + if todo_text[-1] not in splits: + todo_text += "。" + i_split_head = i_split_tail = 0 + len_text = len(todo_text) + todo_texts = [] + while 1: + if i_split_head >= len_text: + break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 + if todo_text[i_split_head] in splits: + i_split_head += 1 + todo_texts.append(todo_text[i_split_tail:i_split_head]) + i_split_tail = i_split_head + else: + i_split_head += 1 + return todo_texts + + +def cut1(inp): + inp = inp.strip("\n") + inps = split(inp) + split_idx = list(range(0, len(inps), 4)) + split_idx[-1] = None + if len(split_idx) > 1: + opts = [] + for idx in range(len(split_idx) - 1): + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) + else: + opts = [inp] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) + + +def cut2(inp): + inp = inp.strip("\n") + inps = split(inp) + if len(inps) < 2: + return inp + opts = [] + summ = 0 + tmp_str = "" + for i in range(len(inps)): + summ += len(inps[i]) + tmp_str += inps[i] + if summ > 50: + summ = 0 + opts.append(tmp_str) + tmp_str = "" + if tmp_str != "": + opts.append(tmp_str) + # print(opts) + if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 + opts[-2] = opts[-2] + opts[-1] + opts = opts[:-1] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) + + +def cut3(inp): + inp = inp.strip("\n") + opts = ["%s" % item for item in inp.strip("。").split("。")] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) + + +def cut4(inp): + inp = inp.strip("\n") + opts = re.split(r"(? 
<!\d)\.(?!\d)", inp.strip("."))
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)
+
+
+def cut5(inp):
+    inp = inp.strip("\n")
+    punds = {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}
+    mergeitems = []
+    items = []
+
+    for i, char in enumerate(inp):
+        if char in punds:
+            if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
+                items.append(char)
+            else:
+                items.append(char)
+                mergeitems.append("".join(items))
+                items = []
+        else:
+            items.append(char)
+
+    if items:
+        mergeitems.append("".join(items))
+
+    opt = [item for item in mergeitems if not set(item).issubset(punds)]
+    return "\n".join(opt)
+
+
+def custom_sort_key(s):
+    # 使用正则表达式提取字符串中的数字部分和非数字部分
+    parts = re.split("(\d+)", s)
+    # 将数字部分转换为整数,非数字部分保持不变
+    parts = [int(part) if part.isdigit() else part for part in parts]
+    return parts
+
+
+def process_text(texts):
+    _text = []
+    if all(text in [None, " ", "\n", ""] for text in texts):
+        raise ValueError(i18n("请输入有效文本"))
+    for text in texts:
+        if text in [None, " ", ""]:
+            pass
+        else:
+            _text.append(text)
+    return _text
+
+
+def change_choices():
+    SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
+    return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {
+        "choices": sorted(GPT_names, key=custom_sort_key),
+        "__type__": "update",
+    }
+
+
+SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
+GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
+for path in SoVITS_weight_root + GPT_weight_root:
+    os.makedirs(path, exist_ok=True)
+
+
+def get_weights_names(GPT_weight_root, SoVITS_weight_root):
+    SoVITS_names = [i for i in pretrained_sovits_name]
+    for path in SoVITS_weight_root:
+        for name in os.listdir(path):
+            if name.endswith(".pth"):
+                SoVITS_names.append("%s/%s" % (path, name))
+    GPT_names = [i for i in pretrained_gpt_name]
+    for path in GPT_weight_root:
+        for name in os.listdir(path):
+            if name.endswith(".ckpt"):
+                GPT_names.append("%s/%s" % (path, name))
+    return SoVITS_names, GPT_names
+
+
+SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
+
+
+def html_center(text, label="p"):
+    return f"""<div style="text-align: center; margin: 100; padding: 50;">
+                <{label} style="margin: 0; padding: 0;">{text}</{label}>
""" + + +def html_left(text, label="p"): + return f"""
+                <{label} style="margin: 0; padding: 0;">{text}</{label}>
""" + + +with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + ) + with gr.Group(): + gr.Markdown(html_center(i18n("模型切换"), "h3")) + with gr.Row(): + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + scale=14, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + scale=14, + ) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14) + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) + gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3")) + with gr.Row(): + inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13) + with gr.Column(scale=13): + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。") + + i18n("v3暂不支持该模式,使用了会报错。"), + value=False, + interactive=True if model_version not in v3v4set else False, + show_label=True, + scale=1, + ) + gr.Markdown( + html_left( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) + ) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) + with gr.Column(scale=14): + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), + choices=list(dict_language.keys()), + value=i18n("中文"), + ) + inp_refs = ( + gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + ) + if model_version not in v3v4set + else gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + visible=False, + ) + ) + sample_steps = ( + gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + value=32 if model_version=="v3"else 8, + choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32], + visible=True, + ) + if model_version in v3v4set + else gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32], + visible=False, + value=32 if model_version=="v3"else 8, + ) + ) + if_sr_Checkbox = gr.Checkbox( + label=i18n("v3输出如果觉得闷可以试试开超分"), + value=False, + interactive=True, + show_label=True, + visible=False if model_version !="v3" else True, + ) + gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) + with gr.Row(): + with gr.Column(scale=13): + text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26) + with gr.Column(scale=7): + text_language = gr.Dropdown( + label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"), + choices=list(dict_language.keys()), + value=i18n("中文"), + scale=1, + ) + how_to_cut = gr.Dropdown( + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) + gr.Markdown(value=html_center(i18n("语速调整,高为更快"))) + if_freeze = gr.Checkbox( + label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), + value=False, + interactive=True, + show_label=True, + scale=1, + ) + with gr.Row(): + speed = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1 + ) + pause_second_slider = gr.Slider( + minimum=0.1, + maximum=0.5, + step=0.01, + label=i18n("句间停顿秒数"), + value=0.3, + interactive=True, + scale=1, + ) + gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):"))) + top_k = gr.Slider( + minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1 + ) + top_p = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1 + ) + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1 + ) + # with gr.Column(): + # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。")) + # phoneme=gr.Textbox(label=i18n("音素框"), value="") + # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") + with gr.Row(): + inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) + output = gr.Audio(label=i18n("输出的语音"), scale=14) + + inference_button.click( + get_tts_wav, + [ + inp_ref, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr_Checkbox, + pause_second_slider, + ], + [output], + ) + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + 
[ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + if_sr_Checkbox, + inference_button, + ], + ) + GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) + + # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) + # with gr.Row(): + # text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="") + # button1 = gr.Button(i18n("凑四句一切"), variant="primary") + # button2 = gr.Button(i18n("凑50字一切"), variant="primary") + # button3 = gr.Button(i18n("按中文句号。切"), variant="primary") + # button4 = gr.Button(i18n("按英文句号.切"), variant="primary") + # button5 = gr.Button(i18n("按标点符号切"), variant="primary") + # text_opt = gr.Textbox(label=i18n("切分后文本"), value="") + # button1.click(cut1, [text_inp], [text_opt]) + # button2.click(cut2, [text_inp], [text_opt]) + # button3.click(cut3, [text_inp], [text_opt]) + # button4.click(cut4, [text_inp], [text_opt]) + # button5.click(cut5, [text_inp], [text_opt]) + # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))) + +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + # quiet=True, + ) diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..311994b8b30396e79d23501860fb410fb87105a0 --- /dev/null +++ b/GPT_SoVITS/inference_webui_fast.py @@ -0,0 +1,546 @@ +""" +按中英混合识别 +按日英混合识别 +多语种启动切分识别语种 +全部按中文识别 +全部按英文识别 +全部按日文识别 +""" + +import json +import logging +import os +import random +import re +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +sys.path.append("%s/GPT_SoVITS" % (now_dir)) + +logging.getLogger("markdown_it").setLevel(logging.ERROR) +logging.getLogger("urllib3").setLevel(logging.ERROR) +logging.getLogger("httpcore").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("asyncio").setLevel(logging.ERROR) +logging.getLogger("charset_normalizer").setLevel(logging.ERROR) +logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) +import torch + +try: + import gradio.analytics as analytics + + analytics.version_check = lambda: None +except: + ... 
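+# The runtime setup below is driven by environment variables (infer_ttswebui, is_share, is_half,
+# gpt_path, sovits_path, cnhubert_base_path, bert_path, version); model paths left unset fall back
+# to the defaults declared in GPT_SoVITS/configs/tts_infer.yaml when TTS_Config is instantiated.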
+ + +infer_ttswebui = os.environ.get("infer_ttswebui", 9872) +infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share = eval(is_share) +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + +is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +gpt_path = os.environ.get("gpt_path", None) +sovits_path = os.environ.get("sovits_path", None) +cnhubert_base_path = os.environ.get("cnhubert_base_path", None) +bert_path = os.environ.get("bert_path", None) +version = model_version = os.environ.get("version", "v2") + +import gradio as gr +from TTS_infer_pack.text_segmentation_method import get_method +from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config + +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +i18n = I18nAuto(language=language) + + +# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 + +if torch.cuda.is_available(): + device = "cuda" +# elif torch.backends.mps.is_available(): +# device = "mps" +else: + device = "cpu" + +# is_half = False +# device = "cpu" + +dict_language_v1 = { + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 +} +dict_language_v2 = { + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): "all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 +} +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 + +cut_method = { + i18n("不切"): "cut0", + i18n("凑四句一切"): "cut1", + i18n("凑50字一切"): "cut2", + i18n("按中文句号。切"): "cut3", + i18n("按英文句号.切"): "cut4", + i18n("按标点符号切"): "cut5", +} + +tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml") +tts_config.device = device +tts_config.is_half = is_half +tts_config.version = version +if gpt_path is not None: + tts_config.t2s_weights_path = gpt_path +if sovits_path is not None: + tts_config.vits_weights_path = sovits_path +if cnhubert_base_path is not None: + tts_config.cnhuhbert_base_path = cnhubert_base_path +if bert_path is not None: + tts_config.bert_base_path = bert_path + +print(tts_config) +tts_pipeline = TTS(tts_config) +gpt_path = tts_config.t2s_weights_path +sovits_path = tts_config.vits_weights_path +version = tts_config.version + + +def inference( + text, + text_lang, + ref_audio_path, + aux_ref_audio_paths, + prompt_text, + prompt_lang, + top_k, + top_p, + temperature, + text_split_method, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, +): + seed = -1 if keep_random else seed + actual_seed = seed if seed not in [-1, "", None] else random.randint(0, 2**32 - 1) + inputs = { + "text": text, + "text_lang": dict_language[text_lang], + "ref_audio_path": ref_audio_path, + "aux_ref_audio_paths": [item.name for item in aux_ref_audio_paths] if aux_ref_audio_paths is not None else [], + "prompt_text": prompt_text if not 
ref_text_free else "", + "prompt_lang": dict_language[prompt_lang], + "top_k": top_k, + "top_p": top_p, + "temperature": temperature, + "text_split_method": cut_method[text_split_method], + "batch_size": int(batch_size), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "return_fragment": False, + "fragment_interval": fragment_interval, + "seed": actual_seed, + "parallel_infer": parallel_infer, + "repetition_penalty": repetition_penalty, + "sample_steps": int(sample_steps), + "super_sampling": super_sampling, + } + try: + for item in tts_pipeline.run(inputs): + yield item, actual_seed + except NO_PROMPT_ERROR: + gr.Warning(i18n("V3不支持无参考文本模式,请填写参考文本!")) + + +def custom_sort_key(s): + # 使用正则表达式提取字符串中的数字部分和非数字部分 + parts = re.split("(\d+)", s) + # 将数字部分转换为整数,非数字部分保持不变 + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def change_choices(): + SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + + +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +is_exist_s2gv4 = os.path.exists(path_sovits_v4) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ + +if os.path.exists("./weight.json"): + pass +else: + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) + +with open("./weight.json", "r", encoding="utf-8") as file: + weight_data = file.read() + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): + gpt_path = gpt_path[0] + if isinstance(sovits_path, list): + sovits_path = sovits_path[0] + + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + + +def get_weights_names(GPT_weight_root, SoVITS_weight_root): + SoVITS_names = [i for i in pretrained_sovits_name] + for path in SoVITS_weight_root: + for name in os.listdir(path): + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [i for i in pretrained_gpt_name] + for path in GPT_weight_root: + for name in os.listdir(path): + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % 
(path, name)) + return SoVITS_names, GPT_names + + +SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) + + +from process_ckpt import get_sovits_version_from_path_fast + +v3v4set={"v3","v4"} +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + # print(sovits_path,version, model_version, if_lora_v3) + is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4 + path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 + if if_lora_v3 == True and is_exist == False: + info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version) + gr.Warning(info) + raise FileExistsError(info) + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 + if prompt_language is not None and text_language is not None: + if prompt_language in list(dict_language.keys()): + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) + else: + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} + if text_language in list(dict_language.keys()): + text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} + else: + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version in v3v4set: + visible_sample_steps = True + visible_inp_refs = False + else: + visible_sample_steps = False + visible_inp_refs = True + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) + + tts_pipeline.init_vits_weights(sovits_path) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) + + +with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + ) + + with gr.Column(): + # with gr.Group(): + gr.Markdown(value=i18n("模型切换")) + with gr.Row(): + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + ) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) + + with gr.Row(): + with gr.Column(): + gr.Markdown(value=i18n("*请上传并填写参考信息")) + with gr.Row(): + inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath") + inp_refs = gr.File( + label=i18n("辅参考音频(可选多个,或不选)"), + file_count="multiple", + visible=True if model_version != "v3" else False, + ) + prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) + with gr.Row(): + prompt_language = gr.Dropdown( + label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文") + ) + with gr.Column(): + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), + value=False, + interactive=True if model_version != "v3" else False, + show_label=True, + ) + gr.Markdown( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) + + with gr.Column(): + gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) + text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=20, max_lines=20) + text_language = gr.Dropdown( + label=i18n("需要合成的文本的语种"), choices=list(dict_language.keys()), value=i18n("中文") + ) + + with gr.Group(): + gr.Markdown(value=i18n("推理设置")) + with gr.Row(): + with gr.Column(): + with gr.Row(): + batch_size = gr.Slider( + minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True + ) + sample_steps = gr.Radio( + label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32, 64, 128], visible=True + ) + with gr.Row(): + fragment_interval = gr.Slider( + minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True + ) + speed_factor = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True + ) + with gr.Row(): + top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True) + top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True) + with gr.Row(): + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True + ) + repetition_penalty = gr.Slider( + minimum=0, maximum=2, step=0.05, label=i18n("重复惩罚"), value=1.35, interactive=True + ) + + with gr.Column(): + with gr.Row(): + how_to_cut = gr.Dropdown( + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) + super_sampling = gr.Checkbox( + label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True + ) + + with gr.Row(): + parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True) + split_bucket = gr.Checkbox( + label=i18n("数据分桶(并行推理时会降低一点计算量)"), + value=True, + interactive=True, + show_label=True, + ) + + with gr.Row(): + seed = gr.Number(label=i18n("随机种子"), value=-1) + keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True) + + output = gr.Audio(label=i18n("输出的语音")) + with gr.Row(): + inference_button = gr.Button(i18n("合成语音"), variant="primary") + stop_infer = gr.Button(i18n("终止合成"), variant="primary") + + inference_button.click( + inference, + [ + text, + text_language, + inp_ref, + inp_refs, + prompt_text, + prompt_language, + top_k, + top_p, + temperature, + how_to_cut, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, + ], + [output, seed], + ) + stop_infer.click(tts_pipeline.stop, [], []) + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + inference_button, + ], + ) # + GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) + + with gr.Group(): + gr.Markdown( + value=i18n( + "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。" + ) + ) + with gr.Row(): + text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4) + with gr.Column(): + _how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + 
interactive=True, + ) + cut_text = gr.Button(i18n("切分"), variant="primary") + + def to_cut(text_inp, how_to_cut): + if len(text_inp.strip()) == 0 or text_inp == []: + return "" + method = get_method(cut_method[how_to_cut]) + return method(text_inp) + + text_opt = gr.Textbox(label=i18n("切分后文本"), value="", lines=4) + cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt]) + gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) + +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + # quiet=True, + ) diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py new file mode 100644 index 0000000000000000000000000000000000000000..fd680135fb7d71afb4680b05a62b9874c39ad21c --- /dev/null +++ b/GPT_SoVITS/onnx_export.py @@ -0,0 +1,398 @@ +import torch +import torchaudio +from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule +from feature_extractor import cnhubert +from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 +from torch import nn + +cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" +cnhubert.cnhubert_base_path = cnhubert_base_path +ssl_model = cnhubert.get_model() +import json +import os + +import soundfile +from text import cleaned_text_to_sequence + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + hann_window = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +class T2SEncoder(nn.Module): + def __init__(self, t2s, vits): + super().__init__() + self.encoder = t2s.onnx_encoder + self.vits = vits + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): + codes = self.vits.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + bert = torch.cat([ref_bert.transpose(0, 1), text_bert.transpose(0, 1)], 1) + all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) + bert = bert.unsqueeze(0) + prompt = prompt_semantic.unsqueeze(0) + return self.encoder(all_phoneme_ids, bert), prompt + + +class T2SModel(nn.Module): + def __init__(self, t2s_path, vits_model): + super().__init__() + dict_s1 = torch.load(t2s_path, map_location="cpu") + self.config = dict_s1["config"] + self.t2s_model = Text2SemanticLightningModule(self.config, "ojbk", is_train=False) + self.t2s_model.load_state_dict(dict_s1["weight"]) + 
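+        # dict_s1 carries both the training config and the AR text-to-semantic weights;
+        # export() below splits this model into three ONNX graphs (encoder, first-stage decoder, stage decoder).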
self.t2s_model.eval() + self.vits_model = vits_model.vq_model + self.hz = 50 + self.max_sec = self.config["data"]["max_sec"] + self.t2s_model.model.top_k = torch.LongTensor([self.config["inference"]["top_k"]]) + self.t2s_model.model.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) + self.t2s_model = self.t2s_model.model + self.t2s_model.init_onnx() + self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model) + self.first_stage_decoder = self.t2s_model.first_stage_decoder + self.stage_decoder = self.t2s_model.stage_decoder + # self.t2s_model = torch.jit.script(self.t2s_model) + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): + early_stop_num = self.t2s_model.early_stop_num + + # [1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] + x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + + prefix_len = prompts.shape[1] + + # [1,N,512] [1,N] + y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) + + stop = False + for idx in range(1, 1500): + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + enco = self.stage_decoder(y, k, v, y_emb, x_example) + y, k, v, y_emb, logits, samples = enco + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.t2s_model.EOS or samples[0, 0] == self.t2s_model.EOS: + stop = True + if stop: + break + y[0, -1] = 0 + + return y[:, -idx:].unsqueeze(0) + + def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False): + # self.onnx_encoder = torch.jit.script(self.onnx_encoder) + if dynamo: + export_options = torch.onnx.ExportOptions(dynamic_shapes=True) + onnx_encoder_export_output = torch.onnx.dynamo_export( + self.onnx_encoder, (ref_seq, text_seq, ref_bert, text_bert, ssl_content), export_options=export_options + ) + onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx") + return + + torch.onnx.export( + self.onnx_encoder, + (ref_seq, text_seq, ref_bert, text_bert, ssl_content), + f"onnx/{project_name}/{project_name}_t2s_encoder.onnx", + input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"], + output_names=["x", "prompts"], + dynamic_axes={ + "ref_seq": {1: "ref_length"}, + "text_seq": {1: "text_length"}, + "ref_bert": {0: "ref_length"}, + "text_bert": {0: "text_length"}, + "ssl_content": {2: "ssl_length"}, + }, + opset_version=16, + ) + x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + + torch.onnx.export( + self.first_stage_decoder, + (x, prompts), + f"onnx/{project_name}/{project_name}_t2s_fsdec.onnx", + input_names=["x", "prompts"], + output_names=["y", "k", "v", "y_emb", "x_example"], + dynamic_axes={ + "x": {1: "x_length"}, + "prompts": {1: "prompts_length"}, + }, + verbose=False, + opset_version=16, + ) + y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) + + torch.onnx.export( + self.stage_decoder, + (y, k, v, y_emb, x_example), + f"onnx/{project_name}/{project_name}_t2s_sdec.onnx", + input_names=["iy", "ik", "iv", "iy_emb", "ix_example"], + output_names=["y", "k", "v", "y_emb", "logits", "samples"], + dynamic_axes={ + "iy": {1: "iy_length"}, + "ik": {1: "ik_length"}, + "iv": {1: "iv_length"}, + "iy_emb": {1: "iy_emb_length"}, + "ix_example": {1: "ix_example_length"}, + }, + verbose=False, + opset_version=16, + ) + + +class VitsModel(nn.Module): + def __init__(self, vits_path): + super().__init__() + dict_s2 = torch.load(vits_path, 
map_location="cpu") + self.hps = dict_s2["config"] + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + self.hps["model"]["version"] = "v1" + else: + self.hps["model"]["version"] = "v2" + + self.hps = DictToAttrRecursive(self.hps) + self.hps.model.semantic_frame_rate = "25hz" + self.vq_model = SynthesizerTrn( + self.hps.data.filter_length // 2 + 1, + self.hps.train.segment_size // self.hps.data.hop_length, + n_speakers=self.hps.data.n_speakers, + **self.hps.model, + ) + self.vq_model.eval() + self.vq_model.load_state_dict(dict_s2["weight"], strict=False) + + def forward(self, text_seq, pred_semantic, ref_audio): + refer = spectrogram_torch( + ref_audio, + self.hps.data.filter_length, + self.hps.data.sampling_rate, + self.hps.data.hop_length, + self.hps.data.win_length, + center=False, + ) + return self.vq_model(pred_semantic, text_seq, refer)[0, 0] + + +class GptSoVits(nn.Module): + def __init__(self, vits, t2s): + super().__init__() + self.vits = vits + self.t2s = t2s + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, debug=False): + pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + audio = self.vits(text_seq, pred_semantic, ref_audio) + if debug: + import onnxruntime + + sess = onnxruntime.InferenceSession("onnx/koharu/koharu_vits.onnx", providers=["CPU"]) + audio1 = sess.run( + None, + { + "text_seq": text_seq.detach().cpu().numpy(), + "pred_semantic": pred_semantic.detach().cpu().numpy(), + "ref_audio": ref_audio.detach().cpu().numpy(), + }, + ) + return audio, audio1 + return audio + + def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, project_name): + self.t2s.export(ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name) + pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + torch.onnx.export( + self.vits, + (text_seq, pred_semantic, ref_audio), + f"onnx/{project_name}/{project_name}_vits.onnx", + input_names=["text_seq", "pred_semantic", "ref_audio"], + output_names=["audio"], + dynamic_axes={ + "text_seq": {1: "text_length"}, + "pred_semantic": {2: "pred_length"}, + "ref_audio": {1: "audio_length"}, + }, + opset_version=17, + verbose=False, + ) + + +class SSLModel(nn.Module): + def __init__(self): + super().__init__() + self.ssl = ssl_model + + def forward(self, ref_audio_16k): + return self.ssl.model(ref_audio_16k)["last_hidden_state"].transpose(1, 2) + + +def export(vits_path, gpt_path, project_name, vits_model="v2"): + vits = VitsModel(vits_path) + gpt = T2SModel(gpt_path, vits) + gpt_sovits = GptSoVits(vits, gpt) + ssl = SSLModel() + ref_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "n", + "i2", + "h", + "ao3", + ",", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) + text_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) + ref_bert = torch.randn((ref_seq.shape[1], 1024)).float() + text_bert = torch.randn((text_seq.shape[1], 1024)).float() + ref_audio = torch.randn((1, 48000 * 5)).float() + # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float() + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, 
vits.hps.data.sampling_rate).float() + + try: + os.mkdir(f"onnx/{project_name}") + except: + pass + + ssl_content = ssl(ref_audio_16k).float() + + # debug = False + debug = True + + # gpt_sovits.export(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, project_name) + + if debug: + a, b = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, debug=debug) + soundfile.write("out1.wav", a.cpu().detach().numpy(), vits.hps.data.sampling_rate) + soundfile.write("out2.wav", b[0], vits.hps.data.sampling_rate) + else: + a = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content).detach().cpu().numpy() + soundfile.write("out.wav", a, vits.hps.data.sampling_rate) + + if vits_model == "v1": + symbols = symbols_v1 + else: + symbols = symbols_v2 + + MoeVSConf = { + "Folder": f"{project_name}", + "Name": f"{project_name}", + "Type": "GPT-SoVits", + "Rate": vits.hps.data.sampling_rate, + "NumLayers": gpt.t2s_model.num_layers, + "EmbeddingDim": gpt.t2s_model.embedding_dim, + "Dict": "BasicDict", + "BertPath": "chinese-roberta-wwm-ext-large", + # "Symbol": symbols, + "AddBlank": False, + } + + MoeVSConfJson = json.dumps(MoeVSConf) + with open(f"onnx/{project_name}.json", "w") as MoeVsConfFile: + json.dump(MoeVSConf, MoeVsConfFile, indent=4) + + +if __name__ == "__main__": + try: + os.mkdir("onnx") + except: + pass + + gpt_path = "GPT_weights/nahida-e25.ckpt" + vits_path = "SoVITS_weights/nahida_e30_s3930.pth" + exp_path = "nahida" + export(vits_path, gpt_path, exp_path) + + # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py new file mode 100644 index 0000000000000000000000000000000000000000..8d83e79ae6a8459339984dfe6c61d698f7e96746 --- /dev/null +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +import os + +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +opt_dir = os.environ.get("opt_dir") +bert_pretrained_dir = os.environ.get("bert_pretrained_dir") +import torch + +is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +version = os.environ.get("version", None) +import traceback +import os.path +from text.cleaner import clean_text +from transformers import AutoModelForMaskedLM, AutoTokenizer +from tools.my_utils import clean_path + +# inp_text=sys.argv[1] +# inp_wav_dir=sys.argv[2] +# exp_name=sys.argv[3] +# i_part=sys.argv[4] +# all_parts=sys.argv[5] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name +# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" + +from time import time as ttime +import shutil + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) +if os.path.exists(txt_path) == False: + bert_dir = "%s/3-bert" % (opt_dir) + 
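+    # Each worker handles lines[int(i_part)::int(all_parts)] of inp_text and writes its own
+    # 2-name2text-{i_part}.txt shard; Chinese entries also get a BERT feature tensor saved under 3-bert/.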
os.makedirs(opt_dir, exist_ok=True) + os.makedirs(bert_dir, exist_ok=True) + if torch.cuda.is_available(): + device = "cuda:0" + # elif torch.backends.mps.is_available(): + # device = "mps" + else: + device = "cpu" + if os.path.exists(bert_pretrained_dir): + ... + else: + raise FileNotFoundError(bert_pretrained_dir) + tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) + bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) + if is_half == True: + bert_model = bert_model.half().to(device) + else: + bert_model = bert_model.to(device) + + def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + + phone_level_feature = torch.cat(phone_level_feature, dim=0) + + return phone_level_feature.T + + def process(data, res): + for name, text, lan in data: + try: + name = clean_path(name) + name = os.path.basename(name) + print(name) + phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("¥", ","), lan, version) + path_bert = "%s/%s.pt" % (bert_dir, name) + if os.path.exists(path_bert) == False and lan == "zh": + bert_feature = get_bert_feature(norm_text, word2ph) + assert bert_feature.shape[-1] == len(phones) + # torch.save(bert_feature, path_bert) + my_save(bert_feature, path_bert) + phones = " ".join(phones) + # res.append([name,phones]) + res.append([name, phones, word2ph, norm_text]) + except: + print(name, text, traceback.format_exc()) + + todo = [] + res = [] + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + language_v1_to_language_v2 = { + "ZH": "zh", + "zh": "zh", + "JP": "ja", + "jp": "ja", + "JA": "ja", + "ja": "ja", + "EN": "en", + "en": "en", + "En": "en", + "KO": "ko", + "Ko": "ko", + "ko": "ko", + "yue": "yue", + "YUE": "yue", + "Yue": "yue", + } + for line in lines[int(i_part) :: int(all_parts)]: + try: + wav_name, spk_name, language, text = line.split("|") + # todo.append([name,text,"zh"]) + if language in language_v1_to_language_v2.keys(): + todo.append([wav_name, text, language_v1_to_language_v2.get(language, language)]) + else: + print(f"\033[33m[Waring] The {language = } of {wav_name} is not supported for training.\033[0m") + except: + print(line, traceback.format_exc()) + + process(todo, res) + opt = [] + for name, phones, word2ph, norm_text in res: + opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) + with open(txt_path, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py new file mode 100644 index 0000000000000000000000000000000000000000..3a84c014aadf435dcce5c6339d060c6742f8633b --- /dev/null +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +import sys +import os + +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +from 
feature_extractor import cnhubert + +opt_dir = os.environ.get("opt_dir") +cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir") +import torch + +is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() + +import traceback +import numpy as np +from scipy.io import wavfile +import librosa + +now_dir = os.getcwd() +sys.path.append(now_dir) +from tools.my_utils import load_audio, clean_path + +# from config import cnhubert_base_path +# cnhubert.cnhubert_base_path=cnhubert_base_path +# inp_text=sys.argv[1] +# inp_wav_dir=sys.argv[2] +# exp_name=sys.argv[3] +# i_part=sys.argv[4] +# all_parts=sys.argv[5] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6] +# cnhubert.cnhubert_base_path=sys.argv[7] +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name + +from time import time as ttime +import shutil + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +hubert_dir = "%s/4-cnhubert" % (opt_dir) +wav32dir = "%s/5-wav32k" % (opt_dir) +os.makedirs(opt_dir, exist_ok=True) +os.makedirs(hubert_dir, exist_ok=True) +os.makedirs(wav32dir, exist_ok=True) + +maxx = 0.95 +alpha = 0.5 +if torch.cuda.is_available(): + device = "cuda:0" +# elif torch.backends.mps.is_available(): +# device = "mps" +else: + device = "cpu" +model = cnhubert.get_model() +# is_half=False +if is_half == True: + model = model.half().to(device) +else: + model = model.to(device) + +nan_fails = [] + + +def name2go(wav_name, wav_path): + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path): + return + tmp_audio = load_audio(wav_path, 32000) + tmp_max = np.abs(tmp_audio).max() + if tmp_max > 2.2: + print("%s-filtered,%s" % (wav_name, tmp_max)) + return + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio + tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio + tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000) # 不是重采样问题 + tensor_wav16 = torch.from_numpy(tmp_audio) + if is_half == True: + tensor_wav16 = tensor_wav16.half().to(device) + else: + tensor_wav16 = tensor_wav16.to(device) + ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu() # torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum() != 0: + nan_fails.append((wav_name, wav_path)) + print("nan filtered:%s" % wav_name) + return + wavfile.write( + "%s/%s" % (wav32dir, wav_name), + 32000, + tmp_audio32.astype("int16"), + ) + my_save(ssl, hubert_path) + + +with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + +for line in lines[int(i_part) :: int(all_parts)]: + try: + # wav_name,text=line.split("\t") + wav_name, spk_name, language, text = line.split("|") + wav_name = clean_path(wav_name) + if inp_wav_dir != "" and inp_wav_dir != None: + wav_name = os.path.basename(wav_name) + wav_path = "%s/%s" % (inp_wav_dir, wav_name) + + else: + wav_path = wav_name + wav_name = os.path.basename(wav_name) + name2go(wav_name, wav_path) + except: + print(line, traceback.format_exc()) + +if len(nan_fails) > 0 and is_half == True: + is_half = False + model = model.float() + for wav in nan_fails: + try: + name2go(wav[0], wav[1]) + except: + print(wav_name, 
traceback.format_exc()) diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb0607cc75fa03a8458f7ab0641e70dc79cf4f4 --- /dev/null +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -0,0 +1,118 @@ +import os + +inp_text = os.environ.get("inp_text") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +opt_dir = os.environ.get("opt_dir") +pretrained_s2G = os.environ.get("pretrained_s2G") +s2config_path = os.environ.get("s2config_path") + +if os.path.exists(pretrained_s2G): + ... +else: + raise FileNotFoundError(pretrained_s2G) +# version=os.environ.get("version","v2") +size = os.path.getsize(pretrained_s2G) +if size < 82978 * 1024: + version = "v1" +elif size < 100 * 1024 * 1024: + version = "v2" +elif size < 103520 * 1024: + version = "v1" +elif size < 700 * 1024 * 1024: + version = "v2" +else: + version = "v3" +import torch + +is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +import traceback +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging +import utils + +if version != "v3": + from module.models import SynthesizerTrn +else: + from module.models import SynthesizerTrnV3 as SynthesizerTrn +from tools.my_utils import clean_path + +logging.getLogger("numba").setLevel(logging.WARNING) +# from config import pretrained_s2G + +# inp_text=sys.argv[1] +# exp_name=sys.argv[2] +# i_part=sys.argv[3] +# all_parts=sys.argv[4] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5] +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name + + +hubert_dir = "%s/4-cnhubert" % (opt_dir) +semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) +if os.path.exists(semantic_path) == False: + os.makedirs(opt_dir, exist_ok=True) + + if torch.cuda.is_available(): + device = "cuda" + # elif torch.backends.mps.is_available(): + # device = "mps" + else: + device = "cpu" + hps = utils.get_hparams_from_file(s2config_path) + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + version=version, + **hps.model, + ) + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) + # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) + print( + vq_model.load_state_dict( + torch.load(pretrained_s2G, map_location="cpu", weights_only=False)["weight"], strict=False + ) + ) + + def name2go(wav_name, lines): + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path) == False: + return + ssl_content = torch.load(hubert_path, map_location="cpu") + if is_half == True: + ssl_content = ssl_content.half().to(device) + else: + ssl_content = ssl_content.to(device) + codes = vq_model.extract_latent(ssl_content) + semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) + lines.append("%s\t%s" % (wav_name, semantic)) + + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + lines1 = [] + for line in lines[int(i_part) :: int(all_parts)]: + # print(line) + try: + # wav_name,text=line.split("\t") + wav_name, spk_name, language, 
text = line.split("|") + wav_name = clean_path(wav_name) + wav_name = os.path.basename(wav_name) + # name2go(name,lines1) + name2go(wav_name, lines1) + except: + print(line, traceback.format_exc()) + with open(semantic_path, "w", encoding="utf8") as f: + f.write("\n".join(lines1)) diff --git a/GPT_SoVITS/pretrained_models/.gitignore b/GPT_SoVITS/pretrained_models/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c96a04f008ee21e260b28f7701595ed59e2839e3 --- /dev/null +++ b/GPT_SoVITS/pretrained_models/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt b/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..83b22408d0d3edd38eef2224a01b2c72a3f50a0b --- /dev/null +++ b/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 +size 155093966 diff --git a/GPT_SoVITS/pretrained_models/s1v3.ckpt b/GPT_SoVITS/pretrained_models/s1v3.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..54a87a3557ca48356e31cf3651155a6dbb8a05da --- /dev/null +++ b/GPT_SoVITS/pretrained_models/s1v3.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87133414860ea14ff6620c483a3db5ed07b44be42e2c3fcdad65523a729a745a +size 155284856 diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..4a2a1bacd6c5c67b370f5716127a9d38e77c012a --- /dev/null +++ b/GPT_SoVITS/process_ckpt.py @@ -0,0 +1,124 @@ +import traceback +from collections import OrderedDict +from time import time as ttime +import shutil +import os +import torch +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +""" +00:v1 +01:v2 +02:v3 +03:v3lora +04:v4lora + +""" +from io import BytesIO + + +def my_save2(fea, path,cfm_version): + bio = BytesIO() + torch.save(fea, bio) + bio.seek(0) + data = bio.getvalue() + byte=b"03" if cfm_version=="v3"else b"04" + data = byte + data[2:] + with open(path, "wb") as f: + f.write(data) + + +def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = hps + opt["info"] = "%sepoch_%siteration" % (epoch, steps) + if lora_rank: + opt["lora_rank"] = lora_rank + my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version) + else: + my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) + return "Success." 
+ except: + return traceback.format_exc() + + +head2version = { + b"00": ["v1", "v1", False], + b"01": ["v2", "v2", False], + b"02": ["v2", "v3", False], + b"03": ["v2", "v3", True], + b"04": ["v2", "v4", True], +} +hash_pretrained_dict = { + "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained + "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained + "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained + "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained +} +import hashlib + + +def get_hash_from_file(sovits_path): + with open(sovits_path, "rb") as f: + data = f.read(8192) + hash_md5 = hashlib.md5() + hash_md5.update(data) + return hash_md5.hexdigest() + + +def get_sovits_version_from_path_fast(sovits_path): + ###1-if it is pretrained sovits models, by hash + hash = get_hash_from_file(sovits_path) + if hash in hash_pretrained_dict: + return hash_pretrained_dict[hash] + ###2-new weights, by head + with open(sovits_path, "rb") as f: + version = f.read(2) + if version != b"PK": + return head2version[version] + ###3-old weights, by file size + if_lora_v3 = False + size = os.path.getsize(sovits_path) + """ + v1weights:about 82942KB + half thr:82978KB + v2weights:about 83014KB + v3weights:about 750MB + """ + if size < 82978 * 1024: + model_version = version = "v1" + elif size < 700 * 1024 * 1024: + model_version = version = "v2" + else: + version = "v2" + model_version = "v3" + return version, model_version, if_lora_v3 + + +def load_sovits_new(sovits_path): + f = open(sovits_path, "rb") + meta = f.read(2) + if meta != "PK": + data = b"PK" + f.read() + bio = BytesIO() + bio.write(data) + bio.seek(0) + return torch.load(bio, map_location="cpu", weights_only=False) + return torch.load(sovits_path, map_location="cpu", weights_only=False) diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py new file mode 100644 index 0000000000000000000000000000000000000000..1176f0bcef869d2f66574d47d9a55d0cc7e1ac0c --- /dev/null +++ b/GPT_SoVITS/s1_train.py @@ -0,0 +1,171 @@ +# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py +import os + +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +import argparse +import logging +import platform +from pathlib import Path + +import torch +from AR.data.data_module import Text2SemanticDataModule +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from AR.utils.io import load_yaml_config +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger +from pytorch_lightning.strategies import DDPStrategy + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("matplotlib").setLevel(logging.WARNING) +torch.set_float32_matmul_precision("high") +from collections import OrderedDict + +from AR.utils import get_newest_ckpt +from process_ckpt import my_save + + +class my_model_ckpt(ModelCheckpoint): + def __init__( + self, + config, + if_save_latest, + if_save_every_weights, + half_weights_save_dir, + exp_name, + **kwargs, + ): + super().__init__(**kwargs) + self.if_save_latest = if_save_latest + self.if_save_every_weights = if_save_every_weights + self.half_weights_save_dir = half_weights_save_dir + self.exp_name = exp_name + self.config = config + + def 
on_train_epoch_end(self, trainer, pl_module): + # if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): + if self._should_save_on_train_epoch_end(trainer): + monitor_candidates = self._monitor_candidates(trainer) + if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0: + if ( + self.if_save_latest == True + ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt + to_clean = list(os.listdir(self.dirpath)) + self._save_topk_checkpoint(trainer, monitor_candidates) + if self.if_save_latest == True: + for name in to_clean: + try: + os.remove("%s/%s" % (self.dirpath, name)) + except: + pass + if self.if_save_every_weights == True: + to_save_od = OrderedDict() + to_save_od["weight"] = OrderedDict() + dictt = trainer.strategy._lightning_module.state_dict() + for key in dictt: + to_save_od["weight"][key] = dictt[key].half() + to_save_od["config"] = self.config + to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) + # torch.save( + # print(os.environ) + if os.environ.get("LOCAL_RANK", "0") == "0": + my_save( + to_save_od, + "%s/%s-e%s.ckpt" + % ( + self.half_weights_save_dir, + self.exp_name, + trainer.current_epoch + 1, + ), + ) + self._save_last_checkpoint(trainer, monitor_candidates) + + +def main(args): + config = load_yaml_config(args.config_file) + + output_dir = Path(config["output_dir"]) + output_dir.mkdir(parents=True, exist_ok=True) + + ckpt_dir = output_dir / "ckpt" + ckpt_dir.mkdir(parents=True, exist_ok=True) + + seed_everything(config["train"]["seed"], workers=True) + ckpt_callback: ModelCheckpoint = my_model_ckpt( + config=config, + if_save_latest=config["train"]["if_save_latest"], + if_save_every_weights=config["train"]["if_save_every_weights"], + half_weights_save_dir=config["train"]["half_weights_save_dir"], + exp_name=config["train"]["exp_name"], + save_top_k=-1, + monitor="top_3_acc", + mode="max", + save_on_train_epoch_end=True, + every_n_epochs=config["train"]["save_every_n_epoch"], + dirpath=ckpt_dir, + ) + logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["USE_LIBUV"] = "0" + trainer: Trainer = Trainer( + max_epochs=config["train"]["epochs"], + accelerator="gpu" if torch.cuda.is_available() else "cpu", + # val_check_interval=9999999999999999999999,###不要验证 + # check_val_every_n_epoch=None, + limit_val_batches=0, + devices=-1 if torch.cuda.is_available() else 1, + benchmark=False, + fast_dev_run=False, + strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo") + if torch.cuda.is_available() + else "auto", + precision=config["train"]["precision"], + logger=logger, + num_sanity_val_steps=0, + callbacks=[ckpt_callback], + use_distributed_sampler=False, # 非常简单的修改,但解决了采用自定义的 bucket_sampler 下训练步数不一致的问题! 
+ ) + + model: Text2SemanticLightningModule = Text2SemanticLightningModule(config, output_dir) + + data_module: Text2SemanticDataModule = Text2SemanticDataModule( + config, + train_semantic_path=config["train_semantic_path"], + train_phoneme_path=config["train_phoneme_path"], + # dev_semantic_path=args.dev_semantic_path, + # dev_phoneme_path=args.dev_phoneme_path + ) + + try: + # 使用正则表达式匹配文件名中的数字部分,并按数字大小进行排序 + newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir)) + ckpt_path = ckpt_dir / newest_ckpt_name + except Exception: + ckpt_path = None + print("ckpt_path:", ckpt_path) + trainer.fit(model, data_module, ckpt_path=ckpt_path) + + +# srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config_file", + type=str, + default="configs/s1longer.yaml", + help="path of config file", + ) + # args for dataset + # parser.add_argument('--train_semantic_path',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/6-name2semantic.tsv') + # parser.add_argument('--train_phoneme_path', type=str, default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/2-name2text.txt') + + # parser.add_argument('--dev_semantic_path', type=str, default='dump_mix/semantic_dev.tsv') + # parser.add_argument('--dev_phoneme_path', type=str, default='dump_mix/phoneme_dev.npy') + # parser.add_argument('--output_dir',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/logs_s1',help='directory to save the results') + # parser.add_argument('--output_dir',type=str,default='/liujing04/gpt_logs/s1/xuangou_ft',help='directory to save the results') + + args = parser.parse_args() + logging.info(str(args)) + main(args) diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4611862ad1c82269030b9da899d2efd467e20e --- /dev/null +++ b/GPT_SoVITS/s2_train.py @@ -0,0 +1,680 @@ +import warnings + +warnings.filterwarnings("ignore") +import os + +import utils + +hps = utils.get_hparams(stage=2) +os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +logging.getLogger("matplotlib").setLevel(logging.INFO) +logging.getLogger("h5py").setLevel(logging.INFO) +logging.getLogger("numba").setLevel(logging.INFO) +from random import randint + +from module import commons +from module.data_utils import ( + DistributedBucketSampler, + TextAudioSpeakerCollate, + TextAudioSpeakerLoader, +) +from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss +from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from module.models import ( + MultiPeriodDiscriminator, + SynthesizerTrn, +) +from process_ckpt import savee + +torch.backends.cudnn.benchmark = False +torch.backends.cudnn.deterministic = False +###反正A100fp32更快,那试试tf32吧 +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 +# from config import pretrained_s2G,pretrained_s2D +global_step = 0 
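+# main() below spawns one worker per visible GPU with mp.spawn; each worker joins the process
+# group ("gloo" on Windows or CPU-only setups, otherwise "nccl") and trains via run(rank, n_gpus, hps).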
+ +device = "cpu" # cuda以外的设备,等mps优化后加入 + + +def main(): + if torch.cuda.is_available(): + n_gpus = torch.cuda.device_count() + else: + n_gpus = 1 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def run(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.data.exp_dir) + logger.info(hps) + # utils.check_git_hash(hps.s2_ckpt_dir) + writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) + + dist.init_process_group( + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + init_method="env://?use_libuv=False", + world_size=n_gpus, + rank=rank, + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + train_dataset = TextAudioSpeakerLoader(hps.data) ######## + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1100, + 1200, + 1300, + 1400, + 1500, + 1600, + 1700, + 1800, + 1900, + ], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=6, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=4, + ) + # if rank == 0: + # eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) + # eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, + # batch_size=1, pin_memory=True, + # drop_last=False, collate_fn=collate_fn) + + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) + + net_d = ( + MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) + if torch.cuda.is_available() + else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + ) + for name, param in net_g.named_parameters(): + if not param.requires_grad: + print(name, "not requires_grad") + + te_p = list(map(id, net_g.enc_p.text_embedding.parameters())) + et_p = list(map(id, net_g.enc_p.encoder_text.parameters())) + mrte_p = list(map(id, net_g.enc_p.mrte.parameters())) + base_params = filter( + lambda p: id(p) not in te_p + et_p + mrte_p and p.requires_grad, + net_g.parameters(), + ) + + # te_p=net_g.enc_p.text_embedding.parameters() + # et_p=net_g.enc_p.encoder_text.parameters() + # mrte_p=net_g.enc_p.mrte.parameters() + + optim_g = torch.optim.AdamW( + # filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 + [ + {"params": base_params, "lr": hps.train.learning_rate}, + { + "params": net_g.enc_p.text_embedding.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.encoder_text.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.mrte.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + ], + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + 
hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + if torch.cuda.is_available(): + net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + else: + net_g = net_g.to(device) + net_d = net_d.to(device) + + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "D_*.pth"), + net_d, + optim_d, + ) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), + net_g, + optim_g, + ) + epoch_str += 1 + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, + net_g.module.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ) + if torch.cuda.is_available() + else net_g.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ), + ) ##测试不加载优化器 + if ( + hps.train.pretrained_s2D != "" + and hps.train.pretrained_s2D != None + and os.path.exists(hps.train.pretrained_s2D) + ): + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) + print( + "loaded pretrained %s" % hps.train.pretrained_s2D, + net_d.module.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], + ) + if torch.cuda.is_available() + else net_d.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], + ), + ) + + # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, + gamma=hps.train.lr_decay, + last_epoch=-1, + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, + gamma=hps.train.lr_decay, + last_epoch=-1, + ) + for _ in range(epoch_str): + scheduler_g.step() + scheduler_d.step() + + scaler = GradScaler(enabled=hps.train.fp16_run) + + print("start training from epoch %s" % epoch_str) + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + # [train_loader, eval_loader], logger, [writer, writer_eval]) + [train_loader, None], + logger, + [writer, writer_eval], + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + ) + scheduler_g.step() + scheduler_d.step() + print("training done") + + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): + net_g, net_d = nets + optim_g, optim_d = optims + # scheduler_g, scheduler_d = schedulers + 
train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) in enumerate(tqdm(train_loader)): + if torch.cuda.is_available(): + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), + ) + y, y_lengths = ( + y.cuda( + rank, + non_blocking=True, + ), + y_lengths.cuda( + rank, + non_blocking=True, + ), + ) + ssl = ssl.cuda(rank, non_blocking=True) + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), + ) + else: + spec, spec_lengths = spec.to(device), spec_lengths.to(device) + y, y_lengths = y.to(device), y_lengths.to(device) + ssl = ssl.to(device) + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = text.to(device), text_lengths.to(device) + + with autocast(enabled=hps.train.fp16_run): + ( + y_hat, + kl_ssl, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + stats_ssl, + ) = net_g(ssl, spec, spec_lengths, text, text_lengths) + + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + + y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, + y_d_hat_g, + ) + loss_disc_all = loss_disc + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + kl_ssl * 1 + loss_kl + + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, + 100.0 * batch_idx / len(train_loader), + ) + ) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc_all, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + 
scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl_ssl": kl_ssl, + "loss/g/kl": loss_kl, + } + ) + + # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + image_dict = None + try: ###Some people installed the wrong version of matplotlib. + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy(), + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy(), + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy(), + ), + "all/stats_ssl": utils.plot_spectrogram_to_numpy( + stats_ssl[0].data.cpu().numpy(), + ), + } + except: + pass + if image_dict: + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + else: + utils.summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + global_step += 1 + if epoch % hps.train.save_every_epoch == 0 and rank == 0: + if hps.train.if_save_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), + ), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(global_step), + ), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), + ), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(233333333333), + ), + ) + if rank == 0 and hps.train.if_save_every_weights == True: + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + global_step, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +def evaluate(hps, generator, eval_loader, writer_eval): + generator.eval() + image_dict = {} + audio_dict = {} + print("Evaluating ...") + with torch.no_grad(): + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) in enumerate(eval_loader): + print(111) + if torch.cuda.is_available(): + spec, spec_lengths = spec.cuda(), spec_lengths.cuda() + y, y_lengths = y.cuda(), y_lengths.cuda() + ssl = ssl.cuda() + text, text_lengths = text.cuda(), text_lengths.cuda() + else: + spec, spec_lengths = spec.to(device), spec_lengths.to(device) + y, y_lengths = y.to(device), y_lengths.to(device) + ssl = ssl.to(device) + text, text_lengths = text.to(device), text_lengths.to(device) + for test in [0, 1]: + y_hat, mask, *_ = ( + generator.module.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) + if torch.cuda.is_available() + else generator.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) + ) + y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length + + mel = spec_to_mel_torch( + spec, + 
hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1).float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + image_dict.update( + { + f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].cpu().numpy(), + ), + } + ) + audio_dict.update( + { + f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]], + }, + ) + image_dict.update( + { + f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()), + }, + ) + audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) + + # y_hat, mask, *_ = generator.module.infer(ssl, spec_lengths, speakers, y=None) + # audio_dict.update({ + # f"gen/audio_{batch_idx}_style_pred": y_hat[0, :, :] + # }) + + utils.summarize( + writer=writer_eval, + global_step=global_step, + images=image_dict, + audios=audio_dict, + audio_sampling_rate=hps.data.sampling_rate, + ) + generator.train() + + +if __name__ == "__main__": + main() diff --git a/GPT_SoVITS/s2_train_v3.py b/GPT_SoVITS/s2_train_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..71d21967eb252faa37a8f55578705b0ae9fe19ba --- /dev/null +++ b/GPT_SoVITS/s2_train_v3.py @@ -0,0 +1,467 @@ +import warnings + +warnings.filterwarnings("ignore") +import os + +import utils + +hps = utils.get_hparams(stage=2) +os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +logging.getLogger("matplotlib").setLevel(logging.INFO) +logging.getLogger("h5py").setLevel(logging.INFO) +logging.getLogger("numba").setLevel(logging.INFO) +from random import randint + +from module import commons +from module.data_utils import ( + DistributedBucketSampler, +) +from module.data_utils import ( + TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, +) +from module.data_utils import ( + TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader, +) +from module.models import ( + SynthesizerTrnV3 as SynthesizerTrn, +) +from process_ckpt import savee + +torch.backends.cudnn.benchmark = False +torch.backends.cudnn.deterministic = False +###反正A100fp32更快,那试试tf32吧 +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 +# from config import pretrained_s2G,pretrained_s2D +global_step = 0 + +device = "cpu" # cuda以外的设备,等mps优化后加入 + + +def main(): + if torch.cuda.is_available(): + n_gpus = torch.cuda.device_count() + else: + n_gpus = 1 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def run(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.data.exp_dir) + logger.info(hps) + # utils.check_git_hash(hps.s2_ckpt_dir) + writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) + + dist.init_process_group( + backend="gloo" if os.name == "nt" or not 
torch.cuda.is_available() else "nccl", + init_method="env://?use_libuv=False", + world_size=n_gpus, + rank=rank, + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + train_dataset = TextAudioSpeakerLoader(hps.data) ######## + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + # 1100, + # 1200, + # 1300, + # 1400, + # 1500, + # 1600, + # 1700, + # 1800, + # 1900, + ], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=6, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=4, + ) + # if rank == 0: + # eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) + # eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, + # batch_size=1, pin_memory=True, + # drop_last=False, collate_fn=collate_fn) + + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) + + # net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + # for name, param in net_g.named_parameters(): + # if not param.requires_grad: + # print(name, "not requires_grad") + + optim_g = torch.optim.AdamW( + filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + # optim_d = torch.optim.AdamW( + # net_d.parameters(), + # hps.train.learning_rate, + # betas=hps.train.betas, + # eps=hps.train.eps, + # ) + if torch.cuda.is_available(): + net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + else: + net_g = net_g.to(device) + # net_d = net_d.to(device) + + try: # 如果能加载自动resume + # _, _, _, epoch_str = utils.load_checkpoint( + # utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_*.pth"), + # net_d, + # optim_d, + # ) # D多半加载没事 + # if rank == 0: + # logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), + net_g, + optim_g, + ) + epoch_str += 1 + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, + net_g.module.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ) + if torch.cuda.is_available() + else net_g.load_state_dict( + 
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ), + ) ##测试不加载优化器 + # if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): + # if rank == 0: + # logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) + # print( + # net_d.module.load_state_dict( + # torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + # ) if torch.cuda.is_available() else net_d.load_state_dict( + # torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + # ) + # ) + + # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) + # scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + # optim_d, gamma=hps.train.lr_decay, last_epoch=-1 + # ) + for _ in range(epoch_str): + scheduler_g.step() + # scheduler_d.step() + + scaler = GradScaler(enabled=hps.train.fp16_run) + + net_d = optim_d = scheduler_d = None + print("start training from epoch %s" % epoch_str) + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + # [train_loader, eval_loader], logger, [writer, writer_eval]) + [train_loader, None], + logger, + [writer, writer_eval], + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + ) + scheduler_g.step() + # scheduler_d.step() + print("training done") + + +def train_and_evaluate( + rank, + epoch, + hps, + nets, + optims, + schedulers, + scaler, + loaders, + logger, + writers, +): + net_g, net_d = nets + optim_g, optim_d = optims + # scheduler_g, scheduler_d = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + # net_d.train() + # for batch_idx, ( + # ssl, + # ssl_lengths, + # spec, + # spec_lengths, + # y, + # y_lengths, + # text, + # text_lengths, + # ) in enumerate(tqdm(train_loader)): + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate( + tqdm(train_loader) + ): + if torch.cuda.is_available(): + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), + ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) + ssl = ssl.cuda(rank, non_blocking=True) + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), + ) + else: + spec, spec_lengths = spec.to(device), spec_lengths.to(device) + mel, mel_lengths = mel.to(device), mel_lengths.to(device) + ssl = ssl.to(device) + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = text.to(device), text_lengths.to(device) + + with autocast(enabled=hps.train.fp16_run): + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + 
use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + # losses = [commit_loss,cfm_loss,mel_loss,loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] + losses = [cfm_loss] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, + 100.0 * batch_idx / len(train_loader), + ) + ) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} + # image_dict = { + # "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + # "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + # "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + # "all/stats_ssl": utils.plot_spectrogram_to_numpy(stats_ssl[0].data.cpu().numpy()), + # } + utils.summarize( + writer=writer, + global_step=global_step, + # images=image_dict, + scalars=scalar_dict, + ) + + # if global_step % hps.train.eval_interval == 0: + # # evaluate(hps, net_g, eval_loader, writer_eval) + # utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,os.path.join(hps.s2_ckpt_dir, "G_{}.pth".format(global_step)),scaler) + # # utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,os.path.join(hps.s2_ckpt_dir, "D_{}.pth".format(global_step)),scaler) + # # keep_ckpts = getattr(hps.train, 'keep_ckpts', 3) + # # if keep_ckpts > 0: + # # utils.clean_checkpoints(path_to_models=hps.s2_ckpt_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True) + + global_step += 1 + if epoch % hps.train.save_every_epoch == 0 and rank == 0: + if hps.train.if_save_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), + ), + ) + # utils.save_checkpoint( + # net_d, + # optim_d, + # hps.train.learning_rate, + # epoch, + # os.path.join( + # "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(global_step) + # ), + # ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), + ), + ) + # utils.save_checkpoint( + # net_d, + # optim_d, + # hps.train.learning_rate, + # epoch, + # os.path.join( + # "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(233333333333) + # ), + # ) + if rank == 0 and hps.train.if_save_every_weights == True: + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + global_step, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +if __name__ == "__main__": + main() diff --git a/GPT_SoVITS/s2_train_v3_lora.py b/GPT_SoVITS/s2_train_v3_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..ddeec4fcb0e046ca0d7a38cc11298b6791417b0b --- /dev/null +++ b/GPT_SoVITS/s2_train_v3_lora.py @@ -0,0 +1,379 @@ +import warnings + +warnings.filterwarnings("ignore") +import os + +import utils + +hps = 
utils.get_hparams(stage=2) +os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +logging.getLogger("matplotlib").setLevel(logging.INFO) +logging.getLogger("h5py").setLevel(logging.INFO) +logging.getLogger("numba").setLevel(logging.INFO) +from collections import OrderedDict as od +from random import randint + +from module import commons +from module.data_utils import ( + DistributedBucketSampler, + TextAudioSpeakerCollateV3, + TextAudioSpeakerLoaderV3, + TextAudioSpeakerCollateV4, + TextAudioSpeakerLoaderV4, + +) +from module.models import ( + SynthesizerTrnV3 as SynthesizerTrn, +) +from peft import LoraConfig, get_peft_model +from process_ckpt import savee + +torch.backends.cudnn.benchmark = False +torch.backends.cudnn.deterministic = False +###反正A100fp32更快,那试试tf32吧 +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 +# from config import pretrained_s2G,pretrained_s2D +global_step = 0 + +device = "cpu" # cuda以外的设备,等mps优化后加入 + + +def main(): + if torch.cuda.is_available(): + n_gpus = torch.cuda.device_count() + else: + n_gpus = 1 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def run(rank, n_gpus, hps): + global global_step, no_grad_names, save_root, lora_rank + if rank == 0: + logger = utils.get_logger(hps.data.exp_dir) + logger.info(hps) + # utils.check_git_hash(hps.s2_ckpt_dir) + writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) + + dist.init_process_group( + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + init_method="env://?use_libuv=False", + world_size=n_gpus, + rank=rank, + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + TextAudioSpeakerLoader=TextAudioSpeakerLoaderV3 if hps.model.version=="v3"else TextAudioSpeakerLoaderV4 + TextAudioSpeakerCollate=TextAudioSpeakerCollateV3 if hps.model.version=="v3"else TextAudioSpeakerCollateV4 + train_dataset = TextAudioSpeakerLoader(hps.data) ######## + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + # 1100, + # 1200, + # 1300, + # 1400, + # 1500, + # 1600, + # 1700, + # 1800, + # 1900, + ], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=6, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=4, + ) + save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank) + os.makedirs(save_root, exist_ok=True) + lora_rank = int(hps.train.lora_rank) + lora_config = LoraConfig( + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + r=lora_rank, + lora_alpha=lora_rank, + init_lora_weights=True, + ) + + def get_model(hps): + return SynthesizerTrn( + hps.data.filter_length // 2 + 1, + 
hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + + def get_optim(net_g): + return torch.optim.AdamW( + filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + + def model2cuda(net_g, rank): + if torch.cuda.is_available(): + net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True) + else: + net_g = net_g.to(device) + return net_g + + try: # 如果能加载自动resume + net_g = get_model(hps) + net_g.cfm = get_peft_model(net_g.cfm, lora_config) + net_g = model2cuda(net_g, rank) + optim_g = get_optim(net_g) + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(save_root, "G_*.pth"), + net_g, + optim_g, + ) + epoch_str += 1 + global_step = (epoch_str - 1) * len(train_loader) + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + net_g = get_model(hps) + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, + net_g.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ), + ) + net_g.cfm = get_peft_model(net_g.cfm, lora_config) + net_g = model2cuda(net_g, rank) + optim_g = get_optim(net_g) + + no_grad_names = set() + for name, param in net_g.named_parameters(): + if not param.requires_grad: + no_grad_names.add(name.replace("module.", "")) + # print(name, "not requires_grad") + # print(no_grad_names) + # os._exit(233333) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) + for _ in range(epoch_str): + scheduler_g.step() + + scaler = GradScaler(enabled=hps.train.fp16_run) + + net_d = optim_d = scheduler_d = None + print("start training from epoch %s" % epoch_str) + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + # [train_loader, eval_loader], logger, [writer, writer_eval]) + [train_loader, None], + logger, + [writer, writer_eval], + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + ) + scheduler_g.step() + print("training done") + + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): + net_g, net_d = nets + optim_g, optim_d = optims + # scheduler_g, scheduler_d = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate( + tqdm(train_loader) + ): + if torch.cuda.is_available(): + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), + ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) + ssl = ssl.cuda(rank, non_blocking=True) + ssl.requires_grad = False + text, 
text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), + ) + else: + spec, spec_lengths = spec.to(device), spec_lengths.to(device) + mel, mel_lengths = mel.to(device), mel_lengths.to(device) + ssl = ssl.to(device) + ssl.requires_grad = False + text, text_lengths = text.to(device), text_lengths.to(device) + + with autocast(enabled=hps.train.fp16_run): + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + losses = [cfm_loss] + logger.info("Train Epoch: {} [{:.0f}%]".format(epoch, 100.0 * batch_idx / len(train_loader))) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} + utils.summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + + global_step += 1 + if epoch % hps.train.save_every_epoch == 0 and rank == 0: + if hps.train.if_save_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(save_root, "G_{}.pth".format(global_step)), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(save_root, "G_{}.pth".format(233333333333)), + ) + if rank == 0 and hps.train.if_save_every_weights == True: + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + sim_ckpt = od() + for key in ckpt: + # if "cfm"not in key: + # print(key) + if key not in no_grad_names: + sim_ckpt[key] = ckpt[key].half().cpu() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + sim_ckpt, + hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank), + epoch, + global_step, + hps,cfm_version=hps.model.version, + lora_rank=lora_rank, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +if __name__ == "__main__": + main() diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1cc2d97f1ccba11802be23cb38a7703740a4e0ec --- /dev/null +++ b/GPT_SoVITS/utils.py @@ -0,0 +1,361 @@ +import argparse +import glob +import json +import logging +import os +import subprocess +import sys +import traceback + +import librosa +import numpy as np +import torch + +logging.getLogger("numba").setLevel(logging.ERROR) +logging.getLogger("matplotlib").setLevel(logging.ERROR) + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.ERROR) +logger = logging + + +def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = 
model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + # assert "quantizer" not in k + # print("load", k) + new_state_dict[k] = saved_state_dict[k] + assert saved_state_dict[k].shape == v.shape, ( + saved_state_dict[k].shape, + v.shape, + ) + except: + traceback.print_exc() + print("error, %s is not in the checkpoint" % k) # shape不对也会,比如text_embedding当cleaner修改时 + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + print("load ") + logger.info( + "Loaded checkpoint '{}' (iteration {})".format( + checkpoint_path, + iteration, + ) + ) + return model, optimizer, learning_rate, iteration + + +import shutil +from time import time as ttime + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path)) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + # torch.save( + my_save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), + aspect="auto", + origin="lower", + interpolation="none", + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = 
np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + data, sampling_rate = librosa.load(full_path, sr=None) + return torch.FloatTensor(data), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True, stage=1): + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config", + type=str, + default="./configs/s2.json", + help="JSON file for configuration", + ) + parser.add_argument("-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir") + parser.add_argument( + "-rs", + "--resume_step", + type=int, + required=False, + default=None, + help="resume step", + ) + # parser.add_argument('-e', '--exp_dir', type=str, required=False,default=None,help='experiment directory') + # parser.add_argument('-g', '--pretrained_s2G', type=str, required=False,default=None,help='pretrained sovits gererator weights') + # parser.add_argument('-d', '--pretrained_s2D', type=str, required=False,default=None,help='pretrained sovits discriminator weights') + + args = parser.parse_args() + + config_path = args.config + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.pretrain = args.pretrain + hparams.resume_step = args.resume_step + # hparams.data.exp_dir = args.exp_dir + if stage == 1: + model_dir = hparams.s1_ckpt_dir + else: + model_dir = hparams.s2_ckpt_dir + config_save_path = os.path.join(model_dir, "config.json") + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + with open(config_save_path, "w") as f: + f.write(data) + return hparams + + +def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True): + """Freeing up space by deleting saved ckpts + + Arguments: + path_to_models -- Path to the model directory + n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth + sort_by_time -- True -> chronologically delete ckpts + False -> lexicographically delete ckpts + """ + import re + + ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] + name_key = lambda _f: int(re.compile("._(\d+)\.pth").match(_f).group(1)) + time_key = lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)) + sort_key = time_key if sort_by_time else name_key + x_sorted = lambda _x: sorted( + [f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], + key=sort_key, + ) + to_del = [ + os.path.join(path_to_models, fn) for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + ] + del_info = lambda fn: logger.info(f".. 
Free up space by deleting ckpt {fn}") + del_routine = lambda x: [os.remove(x), del_info(x)] + rs = [del_routine(fn) for fn in to_del] + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir, + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. {}(saved) != {}(current)".format( + saved_hash[:8], + cur_hash[:8], + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.ERROR) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.ERROR) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +if __name__ == "__main__": + print( + load_wav_to_torch( + "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac", + ) + ) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f1d3206c3b40e0bcdc73384635ab8c59d377c997 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 RVC-Boss + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..463649acccc7935d67633a3df19a6986aa19c14d 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,409 @@ ---- -license: mit ---- +
+# GPT-SoVITS-WebUI
+
+A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
+
+[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
+
+[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
+**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
+ +--- + +## Features: + +1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion. + +2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data for improved voice similarity and realism. + +3. **Cross-lingual Support:** Inference in languages different from the training dataset, currently supporting English, Japanese, Korean, Cantonese and Chinese. + +4. **WebUI Tools:** Integrated tools include voice accompaniment separation, automatic training set segmentation, Chinese ASR, and text labeling, assisting beginners in creating training datasets and GPT/SoVITS models. + +**Check out our [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw) here!** + +Unseen speakers few-shot fine-tuning demo: + +https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + +**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + +## Installation + +For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online. + +### Tested Environments + +| Python Version | PyTorch Version | Device | +|----------------|------------------|-----------------| +| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | +| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | +| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.6.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | +| Python 3.9 | PyTorch 2.8.0dev | CUDA12.8(for Nvidia50x0) | + +### Windows + +If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI. + +**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).** + +### Linux + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### macOS + +**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.** + +1. Install Xcode command-line tools by running `xcode-select --install`. +2. Install the program by running the following commands: + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### Install Manually + +#### Install FFmpeg + +##### Conda Users + +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian Users + +```bash +sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' +``` + +##### Windows Users + +Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. + +Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only) + +##### MacOS Users + +```bash +brew install ffmpeg +``` + +#### Install Dependences + +```bash +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +### Using Docker + +#### docker-compose.yaml configuration + +0. 
Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(outdated) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs. +1. Environment Variables: + - is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation. +2. Volumes Configuration, The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content. +3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation. +4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances. + +#### Running with docker compose + +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### Running with docker command + +As above, modify the corresponding parameters based on your actual situation, then run the following command: + +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +``` + +## Pretrained Models + +**If `install.sh` runs successfully, you may skip No.1,2,3** + +**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).** + +1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`. + +2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only) + +3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. + + - If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class. + + - The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. 
+
+#### Running with docker compose
+
+```
+docker compose -f "docker-compose.yaml" up -d
+```
+
+#### Running with docker command
+
+As above, modify the corresponding parameters based on your actual situation, then run the following command:
+
+```
+docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
+```
+
+## Pretrained Models
+
+**If `install.sh` runs successfully, you may skip steps 1, 2 and 3 below.**
+
+**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
+
+1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
+
+2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip) | [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip the archive, rename the folder to `G2PWModel`, and place it in `GPT_SoVITS/text`. (Chinese TTS only)
+
+3. For UVR5 (vocals/accompaniment separation & reverberation removal, optional), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
+
+   - If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, manually download the model and its corresponding configuration file and put them in `tools/uvr5/uvr5_weights`. **Rename the model and configuration files so that, apart from the suffix, they share the same corresponding name**. In addition, both file names **must include `roformer`** in order to be recognized as roformer-class models.
+
+   - It is suggested to **directly specify the model type** in the model and configuration file names, e.g. `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, and `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
+
+4. For Chinese ASR (optional), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
+
+5. For English or Japanese ASR (optional), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. [Other models](https://huggingface.co/Systran) may offer a similar effect with a smaller disk footprint.
+
+## Dataset Format
+
+The TTS annotation .list file format:
+
+```
+vocal_path|speaker_name|language|text
+```
+
+Language dictionary:
+
+- 'zh': Chinese
+- 'ja': Japanese
+- 'en': English
+- 'ko': Korean
+- 'yue': Cantonese
+
+Example:
+
+```
+D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
+```
+
+## Finetune and inference
+
+### Open WebUI
+
+#### Integrated Package Users
+
+Double-click `go-webui.bat` or use `go-webui.ps1`.
+If you want to switch to V1, double-click `go-webui-v1.bat` or use `go-webui-v1.ps1`.
+
+#### Others
+
+```bash
+python webui.py
+```
+
+If you want to switch to V1, run
+
+```bash
+python webui.py v1
+```
+
+Or manually switch the version in the WebUI.
+
+### Finetune
+
+#### Path Auto-filling is now supported
+
+1. Fill in the audio path
+2. Slice the audio into small chunks
+3. Denoise (optional)
+4. ASR
+5. Proofread the ASR transcriptions
+6. Go to the next tab and finetune the model
+
+### Open Inference WebUI
+
+#### Integrated Package Users
+
+Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1`, then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`.
+
+#### Others
+
+```bash
+python GPT_SoVITS/inference_webui.py
+```
+
+OR
+
+```bash
+python webui.py
+```
+
+then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`.
+
+## V2 Release Notes
+
+New Features:
+
+1. Support Korean and Cantonese
+
+2. An optimized text frontend
+
+3. Pre-trained model extended from 2k hours to 5k hours
+
+4. Improved synthesis quality for low-quality reference audio
+
+   [more details]()
+
+Use v2 from the v1 environment:
+
+1. `pip install -r requirements.txt` to update some packages
+
+2. Clone the latest code from github.
+
+3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
+
+   Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip) | [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (Download the G2PW models, unzip and rename to `G2PWModel`, then place the folder in `GPT_SoVITS/text`.)
+
+## V3 Release Notes
+
+New Features:
+
+1. The timbre similarity is higher, requiring less training data to approximate the target speaker (the timbre similarity is significantly improved using the base model directly without fine-tuning).
+
+2. The GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
+
+   [more details]()
+
+Use v3 from the v2 environment:
+
+1. `pip install -r requirements.txt` to update some packages
+
+2. Clone the latest code from github.
+
+3. Download the v3 pretrained models (s1v3.ckpt, s2Gv3.pth and the models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
+
+   Additional: for the Audio Super Resolution model, see [how to download](./tools/AP_BWE_main/24kto48k/readme.txt).
+
+## V4 Release Notes
+
+New Features:
+
+1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed.
+
+   [more details]()
+
+Use v4 from the v1/v2/v3 environment (a command-line sketch follows the list):
+
+1. `pip install -r requirements.txt` to update some packages
+
+2. Clone the latest code from github.
+
+3. Download the v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
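+
+A minimal sketch of the v4 upgrade steps above. It assumes an existing local clone and that the `huggingface-cli` tool from `huggingface_hub` is installed; the downloaded layout mirrors the file names listed in step 3 and may need adjusting to your setup.
+
+```bash
+# Sketch only: run inside an existing GPT-SoVITS checkout.
+git pull                                   # step 2: get the latest code
+pip install -r extra-req.txt --no-deps     # dependencies, as in the install section above
+pip install -r requirements.txt            # step 1: update packages
+
+# Step 3: fetch only the v4 pretrained files into GPT_SoVITS/pretrained_models
+# (assumes `pip install -U "huggingface_hub[cli]"` has been run)
+huggingface-cli download lj1995/GPT-SoVITS \
+    gsv-v4-pretrained/s2v4.ckpt gsv-v4-pretrained/vocoder.pth \
+    --local-dir GPT_SoVITS/pretrained_models
+```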
+
+## Todo List
+
+- [x] **High Priority:**
+
+  - [x] Localization in Japanese and English.
+  - [x] User guide.
+  - [x] Japanese and English dataset fine-tune training.
+
+- [ ] **Features:**
+  - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
+  - [x] TTS speaking speed control.
+  - [ ] ~~Enhanced TTS emotion control.~~ Maybe use pretrained, finetuned preset GPT models for better emotion.
+  - [ ] Experiment with changing SoVITS token inputs to a probability distribution over GPT vocabs (transformer latent).
+  - [x] Improve the English and Japanese text frontend.
+  - [ ] Develop tiny and larger-sized TTS models.
+  - [x] Colab scripts.
+  - [x] Try to expand the training dataset (2k hours -> 10k hours).
+  - [x] Better SoVITS base model (enhanced audio quality).
+  - [ ] Model mix.
+
+## (Additional) Method for running from the command line
+
+Use the command line to open the WebUI for UVR5:
+
+```
+python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
+```
+
+This is how the audio segmentation of the dataset is done from the command line:
+
+```
+python audio_slicer.py \
+    --input_path "<orig_audio_dir>" \
+    --output_root "<output_dir>" \
+    --threshold <volume_threshold> \
+    --min_length <min_subclip_duration> \
+    --min_interval <min_gap_between_subclips> \
+    --hop_size <hop_size>
+```
+
+This is how dataset ASR processing is done from the command line (Chinese only):
+
+```
+python tools/asr/funasr_asr.py -i <input_dir> -o <output_dir>
+```
+
+ASR processing for languages other than Chinese is performed through Faster-Whisper.
+
+(There are no progress bars, and GPU performance may cause time delays.)
+
+```
+python ./tools/asr/fasterwhisper_asr.py -i <input_dir> -o <output_dir> -l <language> -p <precision>
+```
+
+A custom .list save path can be specified. (A filled-in example of these commands appears after the Credits section below.)
+
+## Credits
+
+Special thanks to the following projects and contributors:
+
+### Theoretical Research
+
+- [ar-vits](https://github.com/innnky/ar-vits)
+- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
+- [vits](https://github.com/jaywalnut310/vits)
+- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
+- [contentvec](https://github.com/auspicious3000/contentvec/)
+- [hifi-gan](https://github.com/jik876/hifi-gan)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
+- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
+### Pretrained Models
+
+- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
+- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
+- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
+### Text Frontend for Inference
+
+- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
+- [split-lang](https://github.com/DoodleBears/split-lang)
+- [g2pW](https://github.com/GitYCC/g2pW)
+- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
+- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
+### WebUI Tools
+
+- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
+- [audio-slicer](https://github.com/openvpi/audio-slicer)
+- [SubFix](https://github.com/cronrpc/SubFix)
+- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
+- [gradio](https://github.com/gradio-app/gradio)
+- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
+- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
+- [AP-BWE](https://github.com/yxlu-0102/AP-BWE)
+
+Thankful to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge.
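+
+As referenced in the command-line section above, here is a filled-in sketch of the dataset-preparation commands. The paths are hypothetical and the slicer values are illustrative only (not project defaults); tune them to your material.
+
+```bash
+# Hypothetical paths; adjust to your own data layout.
+RAW=data/speaker1_raw
+SLICED=output/slicer_opt
+ASR_OUT=output/asr_opt
+
+# 1) Slice long recordings into short clips (illustrative parameter values)
+python audio_slicer.py \
+    --input_path "$RAW" \
+    --output_root "$SLICED" \
+    --threshold -34 \
+    --min_length 4000 \
+    --min_interval 300 \
+    --hop_size 10
+
+# 2a) Transcribe Chinese audio with FunASR
+python tools/asr/funasr_asr.py -i "$SLICED" -o "$ASR_OUT"
+
+# 2b) Or transcribe other languages with Faster-Whisper
+#     (-l is the language code, -p the compute precision; float16 is assumed here)
+python ./tools/asr/fasterwhisper_asr.py -i "$SLICED" -o "$ASR_OUT" -l en -p float16
+```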
+ +## Thanks to all contributors for their efforts + + + + diff --git a/api.py b/api.py new file mode 100644 index 0000000000000000000000000000000000000000..4b79d501fc98bb0841d23cd3f854e97bf4358e2d --- /dev/null +++ b/api.py @@ -0,0 +1,1236 @@ +""" +# api.py usage + +` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" ` + +## 执行参数: + +`-s` - `SoVITS模型路径, 可在 config.py 中指定` +`-g` - `GPT模型路径, 可在 config.py 中指定` + +调用请求缺少参考音频时使用 +`-dr` - `默认参考音频路径` +`-dt` - `默认参考音频文本` +`-dl` - `默认参考音频语种, "中文","英文","日文","韩文","粤语,"zh","en","ja","ko","yue"` + +`-d` - `推理设备, "cuda","cpu"` +`-a` - `绑定地址, 默认"127.0.0.1"` +`-p` - `绑定端口, 默认9880, 可在 config.py 中指定` +`-fp` - `覆盖 config.py 使用全精度` +`-hp` - `覆盖 config.py 使用半精度` +`-sm` - `流式返回模式, 默认不启用, "close","c", "normal","n", "keepalive","k"` +·-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"` +·-st` - `返回的音频数据类型, 默认int16, "int16", "int32"` +·-cp` - `文本切分符号设定, 默认为空, 以",.,。"字符串的方式传入` + +`-hb` - `cnhubert路径` +`-b` - `bert路径` + +## 调用: + +### 推理 + +endpoint: `/` + +使用执行参数指定的参考音频: +GET: + `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` +POST: +```json +{ + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh" +} +``` + +使用执行参数指定的参考音频并设定分割符号: +GET: + `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。` +POST: +```json +{ + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh", + "cut_punc": ",。", +} +``` + +手动指定当次推理所使用的参考音频: +GET: + `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` +POST: +```json +{ + "refer_wav_path": "123.wav", + "prompt_text": "一二三。", + "prompt_language": "zh", + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh" +} +``` + +RESP: +成功: 直接返回 wav 音频流, http code 200 +失败: 返回包含错误信息的 json, http code 400 + +手动指定当次推理所使用的参考音频,并提供参数: +GET: + `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&top_k=20&top_p=0.6&temperature=0.6&speed=1&inp_refs="456.wav"&inp_refs="789.wav"` +POST: +```json +{ + "refer_wav_path": "123.wav", + "prompt_text": "一二三。", + "prompt_language": "zh", + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh", + "top_k": 20, + "top_p": 0.6, + "temperature": 0.6, + "speed": 1, + "inp_refs": ["456.wav","789.wav"] +} +``` + +RESP: +成功: 直接返回 wav 音频流, http code 200 +失败: 返回包含错误信息的 json, http code 400 + + +### 更换默认参考音频 + +endpoint: `/change_refer` + +key与推理端一样 + +GET: + `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh` +POST: +```json +{ + "refer_wav_path": "123.wav", + "prompt_text": "一二三。", + "prompt_language": "zh" +} +``` + +RESP: +成功: json, http code 200 +失败: json, 400 + + +### 命令控制 + +endpoint: `/control` + +command: +"restart": 重新运行 +"exit": 结束运行 + +GET: + `http://127.0.0.1:9880/control?command=restart` +POST: +```json +{ + "command": "restart" +} +``` + +RESP: 无 + +""" + +import argparse +import os +import re +import sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +sys.path.append("%s/GPT_SoVITS" % (now_dir)) + +import signal +from text.LangSegmenter import LangSegmenter +from time import time as ttime +import torch +import torchaudio +import librosa +import soundfile as sf +from fastapi import FastAPI, Request, Query +from fastapi.responses import StreamingResponse, JSONResponse +import uvicorn +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np +from 
feature_extractor import cnhubert +from io import BytesIO +from module.models import SynthesizerTrn, SynthesizerTrnV3 +from peft import LoraConfig, get_peft_model +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from text import cleaned_text_to_sequence +from text.cleaner import clean_text +from module.mel_processing import spectrogram_torch +import config as global_config +import logging +import subprocess + + +class DefaultRefer: + def __init__(self, path, text, language): + self.path = args.default_refer_path + self.text = args.default_refer_text + self.language = args.default_refer_language + + def is_ready(self) -> bool: + return is_full(self.path, self.text, self.language) + + +def is_empty(*items): # 任意一项不为空返回False + for item in items: + if item is not None and item != "": + return False + return True + + +def is_full(*items): # 任意一项为空返回False + for item in items: + if item is None or item == "": + return False + return True + + +def init_bigvgan(): + global bigvgan_model + from BigVGAN import bigvgan + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + bigvgan_model.remove_weight_norm() + bigvgan_model = bigvgan_model.eval() + if is_half == True: + bigvgan_model = bigvgan_model.half().to(device) + else: + bigvgan_model = bigvgan_model.to(device) + + +resample_transform_dict = {} + + +def resample(audio_tensor, sr0): + global resample_transform_dict + if sr0 not in resample_transform_dict: + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) + return resample_transform_dict[sr0](audio_tensor) + + +from module.mel_processing import mel_spectrogram_torch + +spec_min = -12 +spec_max = 2 + + +def norm_spec(x): + return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + +def denorm_spec(x): + return (x + 1) / 2 * (spec_max - spec_min) + spec_min + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +sr_model = None + + +def audio_sr(audio, sr): + global sr_model + if sr_model == None: + from tools.audio_sr import AP_BWE + + try: + sr_model = AP_BWE(device, DictToAttrRecursive) + except FileNotFoundError: + logger.info("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载") + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) + + +class Speaker: + def __init__(self, name, gpt, sovits, phones=None, bert=None, prompt=None): + self.name = name + self.sovits = sovits + self.gpt = gpt + self.phones = phones + self.bert = bert + self.prompt = prompt + + +speaker_list = {} + + +class Sovits: + def __init__(self, vq_model, hps): + self.vq_model = vq_model + self.hps = hps + + +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + + +def get_sovits_weights(sovits_path): + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + is_exist_s2gv3 = os.path.exists(path_sovits_v3) + + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + if if_lora_v3 == True and is_exist_s2gv3 == False: + logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + + dict_s2 = load_sovits_new(sovits_path) + hps = dict_s2["config"] + hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + if 
"enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + hps.model.version = "v1" + else: + hps.model.version = "v2" + + if model_version == "v3": + hps.model.version = "v3" + + model_params_dict = vars(hps.model) + if model_version != "v3": + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **model_params_dict, + ) + else: + vq_model = SynthesizerTrnV3( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **model_params_dict, + ) + init_bigvgan() + model_version = hps.model.version + logger.info(f"模型版本: {model_version}") + if "pretrained" not in sovits_path: + try: + del vq_model.enc_q + except: + pass + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + if if_lora_v3 == False: + vq_model.load_state_dict(dict_s2["weight"], strict=False) + else: + vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False) + lora_rank = dict_s2["lora_rank"] + lora_config = LoraConfig( + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + r=lora_rank, + lora_alpha=lora_rank, + init_lora_weights=True, + ) + vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) + vq_model.load_state_dict(dict_s2["weight"], strict=False) + vq_model.cfm = vq_model.cfm.merge_and_unload() + # torch.save(vq_model.state_dict(),"merge_win.pth") + vq_model.eval() + + sovits = Sovits(vq_model, hps) + return sovits + + +class Gpt: + def __init__(self, max_sec, t2s_model): + self.max_sec = max_sec + self.t2s_model = t2s_model + + +global hz +hz = 50 + + +def get_gpt_weights(gpt_path): + dict_s1 = torch.load(gpt_path, map_location="cpu") + config = dict_s1["config"] + max_sec = config["data"]["max_sec"] + t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) + t2s_model.load_state_dict(dict_s1["weight"]) + if is_half == True: + t2s_model = t2s_model.half() + t2s_model = t2s_model.to(device) + t2s_model.eval() + # total = sum([param.nelement() for param in t2s_model.parameters()]) + # logger.info("Number of parameter: %.2fM" % (total / 1e6)) + + gpt = Gpt(max_sec, t2s_model) + return gpt + + +def change_gpt_sovits_weights(gpt_path, sovits_path): + try: + gpt = get_gpt_weights(gpt_path) + sovits = get_sovits_weights(sovits_path) + except Exception as e: + return JSONResponse({"code": 400, "message": str(e)}, status_code=400) + + speaker_list["default"] = Speaker(name="default", gpt=gpt, sovits=sovits) + return JSONResponse({"code": 0, "message": "Success"}, status_code=200) + + +def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题,精度随bert_model + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + # if(is_half==True):phone_level_feature=phone_level_feature.half() + return phone_level_feature.T + + +def clean_text_inf(text, language, version): + language = language.replace("all_", "") + phones, word2ph, 
norm_text = clean_text(text, language, version) + phones = cleaned_text_to_sequence(phones, version) + return phones, word2ph, norm_text + + +def get_bert_inf(phones, word2ph, norm_text, language): + language = language.replace("all_", "") + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + + return bert + + +from text import chinese + + +def get_phones_and_bert(text, language, version, final=False): + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "zh", version) + else: + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = get_bert_feature(norm_text, word2ph).to(device) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) + else: + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist = [] + langlist = [] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = "".join(norm_text_list) + + if not final and len(phones) < 6: + return get_phones_and_bert("." 
+ text, language, version, final=True) + + return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +def get_spepc(hps, filename): + audio, _ = librosa.load(filename, sr=int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) + return spec + + +def pack_audio(audio_bytes, data, rate): + if media_type == "ogg": + audio_bytes = pack_ogg(audio_bytes, data, rate) + elif media_type == "aac": + audio_bytes = pack_aac(audio_bytes, data, rate) + else: + # wav无法流式, 先暂存raw + audio_bytes = pack_raw(audio_bytes, data, rate) + + return audio_bytes + + +def pack_ogg(audio_bytes, data, rate): + # Author: AkagawaTsurunaki + # Issue: + # Stack overflow probabilistically occurs + # when the function `sf_writef_short` of `libsndfile_64bit.dll` is called + # using the Python library `soundfile` + # Note: + # This is an issue related to `libsndfile`, not this project itself. + # It happens when you generate a large audio tensor (about 499804 frames in my PC) + # and try to convert it to an ogg file. + # Related: + # https://github.com/RVC-Boss/GPT-SoVITS/issues/1199 + # https://github.com/libsndfile/libsndfile/issues/1023 + # https://github.com/bastibe/python-soundfile/issues/396 + # Suggestion: + # Or split the whole audio data into smaller audio segment to avoid stack overflow? + + def handle_pack_ogg(): + with sf.SoundFile(audio_bytes, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: + audio_file.write(data) + + import threading + + # See: https://docs.python.org/3/library/threading.html + # The stack size of this thread is at least 32768 + # If stack overflow error still occurs, just modify the `stack_size`. + # stack_size = n * 4096, where n should be a positive integer. + # Here we chose n = 4096. + stack_size = 4096 * 4096 + try: + threading.stack_size(stack_size) + pack_ogg_thread = threading.Thread(target=handle_pack_ogg) + pack_ogg_thread.start() + pack_ogg_thread.join() + except RuntimeError as e: + # If changing the thread stack size is unsupported, a RuntimeError is raised. + print("RuntimeError: {}".format(e)) + print("Changing the thread stack size is unsupported.") + except ValueError as e: + # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified. 
+ print("ValueError: {}".format(e)) + print("The specified stack size is invalid.") + + return audio_bytes + + +def pack_raw(audio_bytes, data, rate): + audio_bytes.write(data.tobytes()) + + return audio_bytes + + +def pack_wav(audio_bytes, rate): + if is_int32: + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int32) + wav_bytes = BytesIO() + sf.write(wav_bytes, data, rate, format="WAV", subtype="PCM_32") + else: + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16) + wav_bytes = BytesIO() + sf.write(wav_bytes, data, rate, format="WAV") + return wav_bytes + + +def pack_aac(audio_bytes, data, rate): + if is_int32: + pcm = "s32le" + bit_rate = "256k" + else: + pcm = "s16le" + bit_rate = "128k" + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + pcm, # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + bit_rate, # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, _ = process.communicate(input=data.tobytes()) + audio_bytes.write(out) + + return audio_bytes + + +def read_clean_buffer(audio_bytes): + audio_chunk = audio_bytes.getvalue() + audio_bytes.truncate(0) + audio_bytes.seek(0) + + return audio_bytes, audio_chunk + + +def cut_text(text, punc): + punc_list = [p for p in punc if p in {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}] + if len(punc_list) > 0: + punds = r"[" + "".join(punc_list) + r"]" + text = text.strip("\n") + items = re.split(f"({punds})", text) + mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])] + # 在句子不存在符号或句尾无符号的时候保证文本完整 + if len(items) % 2 == 1: + mergeitems.append(items[-1]) + text = "\n".join(mergeitems) + + while "\n\n" in text: + text = text.replace("\n\n", "\n") + + return text + + +def only_punc(text): + return not any(t.isalnum() or t.isalpha() for t in text) + + +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k=15, + top_p=0.6, + temperature=0.6, + speed=1, + inp_refs=None, + sample_steps=32, + if_sr=False, + spk="default", +): + infer_sovits = speaker_list[spk].sovits + vq_model = infer_sovits.vq_model + hps = infer_sovits.hps + version = vq_model.version + + infer_gpt = speaker_list[spk].gpt + t2s_model = infer_gpt.t2s_model + max_sec = infer_gpt.max_sec + + t0 = ttime() + prompt_text = prompt_text.strip("\n") + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." 
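+    # The reference transcript was normalized above: surrounding newlines are stripped and a
+    # sentence-final mark is appended when missing, mirroring how each target text chunk is
+    # terminated further below before synthesis.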
+ prompt_language, text = prompt_language, text.strip("\n") + dtype = torch.float16 if is_half == True else torch.float32 + zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32) + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + codes = vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0).to(device) + + if version != "v3": + refers = [] + if inp_refs: + for path in inp_refs: + try: + refer = get_spepc(hps, path).to(dtype).to(device) + refers.append(refer) + except Exception as e: + logger.error(e) + if len(refers) == 0: + refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] + else: + refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) + + t1 = ttime() + # os.environ['version'] = version + prompt_language = dict_language[prompt_language.lower()] + text_language = dict_language[text_language.lower()] + phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version) + texts = text.split("\n") + audio_bytes = BytesIO() + + for text in texts: + # 简单防止纯符号引发参考音频泄露 + if only_punc(text): + continue + + audio_opt = [] + if text[-1] not in splits: + text += "。" if text_language != "en" else "." + phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) + bert = torch.cat([bert1, bert2], 1) + + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + t2 = ttime() + with torch.no_grad(): + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) + t3 = ttime() + + if version != "v3": + audio = ( + vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 + else: + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + # print(11111111, phoneme_ids0, phoneme_ids1) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + ref_audio, sr = torchaudio.load(ref_wav_path) + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: + ref_audio = ref_audio.mean(0).unsqueeze(0) + if sr != 24000: + ref_audio = resample(ref_audio, sr) + # print("ref_audio",ref_audio.abs().mean()) + mel2 = mel_fn(ref_audio) + mel2 = norm_spec(mel2) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + mel2 = mel2[:, :, :T_min] + fea_ref = fea_ref[:, :, :T_min] + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + chunk_len = 934 - T_min + # print("fea_ref",fea_ref,fea_ref.shape) + # print("mel2",mel2) + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) + # 
print("fea_todo",fea_todo) + # print("ge",ge.abs().mean()) + cfm_resss = [] + idx = 0 + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + idx += chunk_len + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + # set_seed(123) + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] + mel2 = cfm_res[:, :, -T_min:] + # print("fea", fea) + # print("mel2in", mel2) + fea_ref = fea_todo_chunk[:, :, -T_min:] + cfm_resss.append(cfm_res) + cmf_res = torch.cat(cfm_resss, 2) + cmf_res = denorm_spec(cmf_res) + if bigvgan_model == None: + init_bigvgan() + with torch.inference_mode(): + wav_gen = bigvgan_model(cmf_res) + audio = wav_gen[0][0].cpu().detach().numpy() + + max_audio = np.abs(audio).max() + if max_audio > 1: + audio /= max_audio + audio_opt.append(audio) + audio_opt.append(zero_wav) + audio_opt = np.concatenate(audio_opt, 0) + t4 = ttime() + + sr = hps.data.sampling_rate if version != "v3" else 24000 + if if_sr and sr == 24000: + audio_opt = torch.from_numpy(audio_opt).float().to(device) + audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio + sr = 48000 + + if is_int32: + audio_bytes = pack_audio(audio_bytes, (audio_opt * 2147483647).astype(np.int32), sr) + else: + audio_bytes = pack_audio(audio_bytes, (audio_opt * 32768).astype(np.int16), sr) + # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + if stream_mode == "normal": + audio_bytes, audio_chunk = read_clean_buffer(audio_bytes) + yield audio_chunk + + if not stream_mode == "normal": + if media_type == "wav": + sr = 48000 if if_sr else 24000 + sr = hps.data.sampling_rate if version != "v3" else sr + audio_bytes = pack_wav(audio_bytes, sr) + yield audio_bytes.getvalue() + + +def handle_control(command): + if command == "restart": + os.execl(g_config.python_exec, g_config.python_exec, *sys.argv) + elif command == "exit": + os.kill(os.getpid(), signal.SIGTERM) + exit(0) + + +def handle_change(path, text, language): + if is_empty(path, text, language): + return JSONResponse( + {"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400 + ) + + if path != "" or path is not None: + default_refer.path = path + if text != "" or text is not None: + default_refer.text = text + if language != "" or language is not None: + default_refer.language = language + + logger.info(f"当前默认参考音频路径: {default_refer.path}") + logger.info(f"当前默认参考音频文本: {default_refer.text}") + logger.info(f"当前默认参考音频语种: {default_refer.language}") + logger.info(f"is_ready: {default_refer.is_ready()}") + + return JSONResponse({"code": 0, "message": "Success"}, status_code=200) + + +def handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, +): + if ( + refer_wav_path == "" + or refer_wav_path is None + or prompt_text == "" + or prompt_text is None + or prompt_language == "" + or prompt_language is None + ): + refer_wav_path, prompt_text, prompt_language = ( + default_refer.path, + default_refer.text, + default_refer.language, + ) + if not default_refer.is_ready(): + return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) + + if sample_steps not in [4, 8, 16, 32]: + sample_steps = 32 + + if cut_punc == None: + text = 
cut_text(text, default_cut_punc) + else: + text = cut_text(text, cut_punc) + + return StreamingResponse( + get_tts_wav( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ), + media_type="audio/" + media_type, + ) + + +# -------------------------------- +# 初始化部分 +# -------------------------------- +dict_language = { + "中文": "all_zh", + "粤语": "all_yue", + "英文": "en", + "日文": "all_ja", + "韩文": "all_ko", + "中英混合": "zh", + "粤英混合": "yue", + "日英混合": "ja", + "韩英混合": "ko", + "多语种混合": "auto", # 多语种启动切分识别语种 + "多语种混合(粤语)": "auto_yue", + "all_zh": "all_zh", + "all_yue": "all_yue", + "en": "en", + "all_ja": "all_ja", + "all_ko": "all_ko", + "zh": "zh", + "yue": "yue", + "ja": "ja", + "ko": "ko", + "auto": "auto", + "auto_yue": "auto_yue", +} + +# logger +logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG) +logger = logging.getLogger("uvicorn") + +# 获取配置 +g_config = global_config.Config() + +# 获取参数 +parser = argparse.ArgumentParser(description="GPT-SoVITS api") + +parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径") +parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径") +parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径") +parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本") +parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种") +parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu") +parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0") +parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") +parser.add_argument( + "-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度" +) +parser.add_argument( + "-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度" +) +# bool值的用法为 `python ./api.py -fp ...` +# 此时 full_precision==True, half_precision==False +parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="流式返回模式, close / normal / keepalive") +parser.add_argument("-mt", "--media_type", type=str, default="wav", help="音频编码格式, wav / ogg / aac") +parser.add_argument("-st", "--sub_type", type=str, default="int16", help="音频数据类型, int16 / int32") +parser.add_argument("-cp", "--cut_punc", type=str, default="", help="文本切分符号设定, 符号范围,.;?!、,。?!;:…") +# 切割常用分句符为 `python ./api.py -cp ".?!。?!"` +parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path") +parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path") + +args = parser.parse_args() +sovits_path = args.sovits_path +gpt_path = args.gpt_path +device = args.device +port = args.port +host = args.bind_addr +cnhubert_base_path = args.hubert_path +bert_path = args.bert_path +default_cut_punc = args.cut_punc + +# 应用参数配置 +default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language) + +# 模型路径检查 +if sovits_path == "": + sovits_path = g_config.pretrained_sovits_path + logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}") +if gpt_path == "": + gpt_path = g_config.pretrained_gpt_path + logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}") + +# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用 +if 
default_refer.path == "" or default_refer.text == "" or default_refer.language == "": + default_refer.path, default_refer.text, default_refer.language = "", "", "" + logger.info("未指定默认参考音频") +else: + logger.info(f"默认参考音频路径: {default_refer.path}") + logger.info(f"默认参考音频文本: {default_refer.text}") + logger.info(f"默认参考音频语种: {default_refer.language}") + +# 获取半精度 +is_half = g_config.is_half +if args.full_precision: + is_half = False +if args.half_precision: + is_half = True +if args.full_precision and args.half_precision: + is_half = g_config.is_half # 炒饭fallback +logger.info(f"半精: {is_half}") + +# 流式返回模式 +if args.stream_mode.lower() in ["normal", "n"]: + stream_mode = "normal" + logger.info("流式返回已开启") +else: + stream_mode = "close" + +# 音频编码格式 +if args.media_type.lower() in ["aac", "ogg"]: + media_type = args.media_type.lower() +elif stream_mode == "close": + media_type = "wav" +else: + media_type = "ogg" +logger.info(f"编码格式: {media_type}") + +# 音频数据类型 +if args.sub_type.lower() == "int32": + is_int32 = True + logger.info("数据类型: int32") +else: + is_int32 = False + logger.info("数据类型: int16") + +# 初始化模型 +cnhubert.cnhubert_base_path = cnhubert_base_path +tokenizer = AutoTokenizer.from_pretrained(bert_path) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +ssl_model = cnhubert.get_model() +if is_half: + bert_model = bert_model.half().to(device) + ssl_model = ssl_model.half().to(device) +else: + bert_model = bert_model.to(device) + ssl_model = ssl_model.to(device) +change_gpt_sovits_weights(gpt_path=gpt_path, sovits_path=sovits_path) + + +# -------------------------------- +# 接口部分 +# -------------------------------- +app = FastAPI() + + +@app.post("/set_model") +async def set_model(request: Request): + json_post_raw = await request.json() + return change_gpt_sovits_weights( + gpt_path=json_post_raw.get("gpt_model_path"), sovits_path=json_post_raw.get("sovits_model_path") + ) + + +@app.get("/set_model") +async def set_model( + gpt_model_path: str = None, + sovits_model_path: str = None, +): + return change_gpt_sovits_weights(gpt_path=gpt_model_path, sovits_path=sovits_model_path) + + +@app.post("/control") +async def control(request: Request): + json_post_raw = await request.json() + return handle_control(json_post_raw.get("command")) + + +@app.get("/control") +async def control(command: str = None): + return handle_control(command) + + +@app.post("/change_refer") +async def change_refer(request: Request): + json_post_raw = await request.json() + return handle_change( + json_post_raw.get("refer_wav_path"), json_post_raw.get("prompt_text"), json_post_raw.get("prompt_language") + ) + + +@app.get("/change_refer") +async def change_refer(refer_wav_path: str = None, prompt_text: str = None, prompt_language: str = None): + return handle_change(refer_wav_path, prompt_text, prompt_language) + + +@app.post("/") +async def tts_endpoint(request: Request): + json_post_raw = await request.json() + return handle( + json_post_raw.get("refer_wav_path"), + json_post_raw.get("prompt_text"), + json_post_raw.get("prompt_language"), + json_post_raw.get("text"), + json_post_raw.get("text_language"), + json_post_raw.get("cut_punc"), + json_post_raw.get("top_k", 15), + json_post_raw.get("top_p", 1.0), + json_post_raw.get("temperature", 1.0), + json_post_raw.get("speed", 1.0), + json_post_raw.get("inp_refs", []), + json_post_raw.get("sample_steps", 32), + json_post_raw.get("if_sr", False), + ) + + +@app.get("/") +async def tts_endpoint( + refer_wav_path: str = None, + prompt_text: str = None, + prompt_language: 
str = None, + text: str = None, + text_language: str = None, + cut_punc: str = None, + top_k: int = 15, + top_p: float = 1.0, + temperature: float = 1.0, + speed: float = 1.0, + inp_refs: list = Query(default=[]), + sample_steps: int = 32, + if_sr: bool = False, +): + return handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ) + + +if __name__ == "__main__": + uvicorn.run(app, host=host, port=port, workers=1) diff --git a/api_v2.py b/api_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..8708207432d2ec53a99b4e4e53cfbdaa57dd2769 --- /dev/null +++ b/api_v2.py @@ -0,0 +1,500 @@ +""" +# WebAPI文档 + +` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml ` + +## 执行参数: + `-a` - `绑定地址, 默认"127.0.0.1"` + `-p` - `绑定端口, 默认9880` + `-c` - `TTS配置文件路径, 默认"GPT_SoVITS/configs/tts_infer.yaml"` + +## 调用: + +### 推理 + +endpoint: `/tts` +GET: +``` +http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true +``` + +POST: +```json +{ + "text": "", # str.(required) text to be synthesized + "text_lang: "", # str.(required) language of the text to be synthesized + "ref_audio_path": "", # str.(required) reference audio path + "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion + "prompt_text": "", # str.(optional) prompt text for the reference audio + "prompt_lang": "", # str.(required) language of the prompt text for the reference audio + "top_k": 5, # int. top k sampling + "top_p": 1, # float. top p sampling + "temperature": 1, # float. temperature for sampling + "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details. + "batch_size": 1, # int. batch size for inference + "batch_threshold": 0.75, # float. threshold for batch splitting. + "split_bucket: True, # bool. whether to split the batch into multiple buckets. + "speed_factor":1.0, # float. control the speed of the synthesized audio. + "streaming_mode": False, # bool. whether to return a streaming response. + "seed": -1, # int. random seed for reproducibility. + "parallel_infer": True, # bool. whether to use parallel inference. + "repetition_penalty": 1.35 # float. repetition penalty for T2S model. + "sample_steps": 32, # int. number of sampling steps for VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. 
+} +``` + +RESP: +成功: 直接返回 wav 音频流, http code 200 +失败: 返回包含错误信息的 json, http code 400 + +### 命令控制 + +endpoint: `/control` + +command: +"restart": 重新运行 +"exit": 结束运行 + +GET: +``` +http://127.0.0.1:9880/control?command=restart +``` +POST: +```json +{ + "command": "restart" +} +``` + +RESP: 无 + + +### 切换GPT模型 + +endpoint: `/set_gpt_weights` + +GET: +``` +http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt +``` +RESP: +成功: 返回"success", http code 200 +失败: 返回包含错误信息的 json, http code 400 + + +### 切换Sovits模型 + +endpoint: `/set_sovits_weights` + +GET: +``` +http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth +``` + +RESP: +成功: 返回"success", http code 200 +失败: 返回包含错误信息的 json, http code 400 + +""" + +import os +import sys +import traceback +from typing import Generator + +now_dir = os.getcwd() +sys.path.append(now_dir) +sys.path.append("%s/GPT_SoVITS" % (now_dir)) + +import argparse +import subprocess +import wave +import signal +import numpy as np +import soundfile as sf +from fastapi import FastAPI, Response +from fastapi.responses import StreamingResponse, JSONResponse +import uvicorn +from io import BytesIO +from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config +from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names +from pydantic import BaseModel + +# print(sys.path) +i18n = I18nAuto() +cut_method_names = get_cut_method_names() + +parser = argparse.ArgumentParser(description="GPT-SoVITS api") +parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径") +parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") +parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880") +args = parser.parse_args() +config_path = args.tts_config +# device = args.device +port = args.port +host = args.bind_addr +argv = sys.argv + +if config_path in [None, ""]: + config_path = "GPT-SoVITS/configs/tts_infer.yaml" + +tts_config = TTS_Config(config_path) +print(tts_config) +tts_pipeline = TTS(tts_config) + +APP = FastAPI() + + +class TTS_Request(BaseModel): + text: str = None + text_lang: str = None + ref_audio_path: str = None + aux_ref_audio_paths: list = None + prompt_lang: str = None + prompt_text: str = "" + top_k: int = 5 + top_p: float = 1 + temperature: float = 1 + text_split_method: str = "cut5" + batch_size: int = 1 + batch_threshold: float = 0.75 + split_bucket: bool = True + speed_factor: float = 1.0 + fragment_interval: float = 0.3 + seed: int = -1 + media_type: str = "wav" + streaming_mode: bool = False + parallel_infer: bool = True + repetition_penalty: float = 1.35 + sample_steps: int = 32 + super_sampling: bool = False + + +### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files +def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int): + with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: + audio_file.write(data) + return io_buffer + + +def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int): + io_buffer.write(data.tobytes()) + return io_buffer + + +def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int): + io_buffer = BytesIO() + sf.write(io_buffer, data, rate, format="wav") + return io_buffer + + +def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int): + process = subprocess.Popen( + [ + 
"ffmpeg", + "-f", + "s16le", # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + "192k", # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, _ = process.communicate(input=data.tobytes()) + io_buffer.write(out) + return io_buffer + + +def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str): + if media_type == "ogg": + io_buffer = pack_ogg(io_buffer, data, rate) + elif media_type == "aac": + io_buffer = pack_aac(io_buffer, data, rate) + elif media_type == "wav": + io_buffer = pack_wav(io_buffer, data, rate) + else: + io_buffer = pack_raw(io_buffer, data, rate) + io_buffer.seek(0) + return io_buffer + + +# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py +def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000): + # This will create a wave header then append the frame input + # It should be first on a streaming wav file + # Other frames better should not have it (else you will hear some artifacts each chunk start) + wav_buf = BytesIO() + with wave.open(wav_buf, "wb") as vfout: + vfout.setnchannels(channels) + vfout.setsampwidth(sample_width) + vfout.setframerate(sample_rate) + vfout.writeframes(frame_input) + + wav_buf.seek(0) + return wav_buf.read() + + +def handle_control(command: str): + if command == "restart": + os.execl(sys.executable, sys.executable, *argv) + elif command == "exit": + os.kill(os.getpid(), signal.SIGTERM) + exit(0) + + +def check_params(req: dict): + text: str = req.get("text", "") + text_lang: str = req.get("text_lang", "") + ref_audio_path: str = req.get("ref_audio_path", "") + streaming_mode: bool = req.get("streaming_mode", False) + media_type: str = req.get("media_type", "wav") + prompt_lang: str = req.get("prompt_lang", "") + text_split_method: str = req.get("text_split_method", "cut5") + + if ref_audio_path in [None, ""]: + return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"}) + if text in [None, ""]: + return JSONResponse(status_code=400, content={"message": "text is required"}) + if text_lang in [None, ""]: + return JSONResponse(status_code=400, content={"message": "text_lang is required"}) + elif text_lang.lower() not in tts_config.languages: + return JSONResponse( + status_code=400, + content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"}, + ) + if prompt_lang in [None, ""]: + return JSONResponse(status_code=400, content={"message": "prompt_lang is required"}) + elif prompt_lang.lower() not in tts_config.languages: + return JSONResponse( + status_code=400, + content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"}, + ) + if media_type not in ["wav", "raw", "ogg", "aac"]: + return JSONResponse(status_code=400, content={"message": f"media_type: {media_type} is not supported"}) + elif media_type == "ogg" and not streaming_mode: + return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"}) + + if text_split_method not in cut_method_names: + return JSONResponse( + status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"} + ) + + return None + + +async def tts_handle(req: dict): + """ + Text to speech handler. 
+ + Args: + req (dict): + { + "text": "", # str.(required) text to be synthesized + "text_lang: "", # str.(required) language of the text to be synthesized + "ref_audio_path": "", # str.(required) reference audio path + "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker synthesis + "prompt_text": "", # str.(optional) prompt text for the reference audio + "prompt_lang": "", # str.(required) language of the prompt text for the reference audio + "top_k": 5, # int. top k sampling + "top_p": 1, # float. top p sampling + "temperature": 1, # float. temperature for sampling + "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details. + "batch_size": 1, # int. batch size for inference + "batch_threshold": 0.75, # float. threshold for batch splitting. + "split_bucket: True, # bool. whether to split the batch into multiple buckets. + "speed_factor":1.0, # float. control the speed of the synthesized audio. + "fragment_interval":0.3, # float. to control the interval of the audio fragment. + "seed": -1, # int. random seed for reproducibility. + "media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". + "streaming_mode": False, # bool. whether to return a streaming response. + "parallel_infer": True, # bool.(optional) whether to use parallel inference. + "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "sample_steps": 32, # int. number of sampling steps for VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. + } + returns: + StreamingResponse: audio stream response. + """ + + streaming_mode = req.get("streaming_mode", False) + return_fragment = req.get("return_fragment", False) + media_type = req.get("media_type", "wav") + + check_res = check_params(req) + if check_res is not None: + return check_res + + if streaming_mode or return_fragment: + req["return_fragment"] = True + + try: + tts_generator = tts_pipeline.run(req) + + if streaming_mode: + + def streaming_generator(tts_generator: Generator, media_type: str): + if_frist_chunk = True + for sr, chunk in tts_generator: + if if_frist_chunk and media_type == "wav": + yield wave_header_chunk(sample_rate=sr) + media_type = "raw" + if_frist_chunk = False + yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue() + + # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}" + return StreamingResponse( + streaming_generator( + tts_generator, + media_type, + ), + media_type=f"audio/{media_type}", + ) + + else: + sr, audio_data = next(tts_generator) + audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue() + return Response(audio_data, media_type=f"audio/{media_type}") + except Exception as e: + return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)}) + + +@APP.get("/control") +async def control(command: str = None): + if command is None: + return JSONResponse(status_code=400, content={"message": "command is required"}) + handle_control(command) + + +@APP.get("/tts") +async def tts_get_endpoint( + text: str = None, + text_lang: str = None, + ref_audio_path: str = None, + aux_ref_audio_paths: list = None, + prompt_lang: str = None, + prompt_text: str = "", + top_k: int = 5, + top_p: float = 1, + temperature: float = 1, + text_split_method: str = "cut0", + batch_size: int = 1, + batch_threshold: float = 0.75, + 
split_bucket: bool = True, + speed_factor: float = 1.0, + fragment_interval: float = 0.3, + seed: int = -1, + media_type: str = "wav", + streaming_mode: bool = False, + parallel_infer: bool = True, + repetition_penalty: float = 1.35, + sample_steps: int = 32, + super_sampling: bool = False, +): + req = { + "text": text, + "text_lang": text_lang.lower(), + "ref_audio_path": ref_audio_path, + "aux_ref_audio_paths": aux_ref_audio_paths, + "prompt_text": prompt_text, + "prompt_lang": prompt_lang.lower(), + "top_k": top_k, + "top_p": top_p, + "temperature": temperature, + "text_split_method": text_split_method, + "batch_size": int(batch_size), + "batch_threshold": float(batch_threshold), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "fragment_interval": fragment_interval, + "seed": seed, + "media_type": media_type, + "streaming_mode": streaming_mode, + "parallel_infer": parallel_infer, + "repetition_penalty": float(repetition_penalty), + "sample_steps": int(sample_steps), + "super_sampling": super_sampling, + } + return await tts_handle(req) + + +@APP.post("/tts") +async def tts_post_endpoint(request: TTS_Request): + req = request.dict() + return await tts_handle(req) + + +@APP.get("/set_refer_audio") +async def set_refer_aduio(refer_audio_path: str = None): + try: + tts_pipeline.set_ref_audio(refer_audio_path) + except Exception as e: + return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)}) + return JSONResponse(status_code=200, content={"message": "success"}) + + +# @APP.post("/set_refer_audio") +# async def set_refer_aduio_post(audio_file: UploadFile = File(...)): +# try: +# # 检查文件类型,确保是音频文件 +# if not audio_file.content_type.startswith("audio/"): +# return JSONResponse(status_code=400, content={"message": "file type is not supported"}) + +# os.makedirs("uploaded_audio", exist_ok=True) +# save_path = os.path.join("uploaded_audio", audio_file.filename) +# # 保存音频文件到服务器上的一个目录 +# with open(save_path , "wb") as buffer: +# buffer.write(await audio_file.read()) + +# tts_pipeline.set_ref_audio(save_path) +# except Exception as e: +# return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) +# return JSONResponse(status_code=200, content={"message": "success"}) + + +@APP.get("/set_gpt_weights") +async def set_gpt_weights(weights_path: str = None): + try: + if weights_path in ["", None]: + return JSONResponse(status_code=400, content={"message": "gpt weight path is required"}) + tts_pipeline.init_t2s_weights(weights_path) + except Exception as e: + return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)}) + + return JSONResponse(status_code=200, content={"message": "success"}) + + +@APP.get("/set_sovits_weights") +async def set_sovits_weights(weights_path: str = None): + try: + if weights_path in ["", None]: + return JSONResponse(status_code=400, content={"message": "sovits weight path is required"}) + tts_pipeline.init_vits_weights(weights_path) + except Exception as e: + return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)}) + return JSONResponse(status_code=200, content={"message": "success"}) + + +if __name__ == "__main__": + try: + if host == "None": # 在调用时使用 -a None 参数,可以让api监听双栈 + host = None + uvicorn.run(app=APP, host=host, port=port, workers=1) + except Exception: + traceback.print_exc() + os.kill(os.getpid(), signal.SIGTERM) + exit(0) diff --git a/colab_webui.ipynb 
b/colab_webui.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b4107756a0eb426e92ed4aab9ee051631b989389 --- /dev/null +++ b/colab_webui.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-SoVITS WebUI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_o6a8GS2lWQM" + }, + "source": [ + "## Env Setup (Run Once Only)\n", + "## 环境配置, 只需运行一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile /content/setup.sh\n", + "set -e\n", + "\n", + "cd /content\n", + "\n", + "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", + "\n", + "cd GPT-SoVITS\n", + "\n", + "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", + " :\n", + "else\n", + " conda create -n GPTSoVITS python=3.10 -y\n", + "fi\n", + "\n", + "source activate GPTSoVITS\n", + "\n", + "pip install ipykernel\n", + "\n", + "bash install.sh --source HF --download-uvr5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q condacolab\n", + "import condacolab\n", + "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", + "!cd /content && bash setup.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch WebUI\n", + "## 启动 WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4oRGUzkrk8C7" + }, + "outputs": [], + "source": [ + "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..5f90c5cd60994b3f9e064681cd67f80d738a06f6 --- /dev/null +++ b/config.py @@ -0,0 +1,69 @@ +import sys +import os + +import torch + +# 推理用的指定模型 +sovits_path = "" +gpt_path = "" +is_half_str = os.environ.get("is_half", "True") +is_half = True if is_half_str.lower() == "true" else False +is_share_str = os.environ.get("is_share", "False") +is_share = True if is_share_str.lower() == "true" else False + +cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" +bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" +pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" + +exp_root = "logs" +python_exec = sys.executable or "python" +if torch.cuda.is_available(): + infer_device = "cuda" +else: + infer_device = "cpu" + +webui_port_main = 9874 +webui_port_uvr5 = 9873 +webui_port_infer_tts = 9872 +webui_port_subfix = 9871 + +api_port = 9880 + +if infer_device == "cuda": + gpu_name = torch.cuda.get_device_name(0) + if ( + ("16" in gpu_name and "V100" not in gpu_name.upper()) + or "P40" in gpu_name.upper() + or "P10" in gpu_name.upper() + or "1060" in gpu_name + or "1070" 
in gpu_name + or "1080" in gpu_name + ): + is_half = False + +if infer_device == "cpu": + is_half = False + + +class Config: + def __init__(self): + self.sovits_path = sovits_path + self.gpt_path = gpt_path + self.is_half = is_half + + self.cnhubert_path = cnhubert_path + self.bert_path = bert_path + self.pretrained_sovits_path = pretrained_sovits_path + self.pretrained_gpt_path = pretrained_gpt_path + + self.exp_root = exp_root + self.python_exec = python_exec + self.infer_device = infer_device + + self.webui_port_main = webui_port_main + self.webui_port_uvr5 = webui_port_uvr5 + self.webui_port_infer_tts = webui_port_infer_tts + self.webui_port_subfix = webui_port_subfix + + self.api_port = api_port diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aca8ab9ed1b6938016e1fcacf1d24d673255576e --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,32 @@ +version: '3.8' + +services: + gpt-sovits: + image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. + container_name: gpt-sovits-container + environment: + - is_half=False + - is_share=False + volumes: + - ./output:/workspace/output + - ./logs:/workspace/logs + - ./SoVITS_weights:/workspace/SoVITS_weights + - ./reference:/workspace/reference + working_dir: /workspace + ports: + - "9880:9880" + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + shm_size: 16G + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] + stdin_open: true + tty: true + restart: unless-stopped diff --git a/dockerbuild.sh b/dockerbuild.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a4a1e183d88ac6c0167d5d5e05519cf6da13b9a --- /dev/null +++ b/dockerbuild.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# 获取当前日期,格式为 YYYYMMDD +DATE=$(date +%Y%m%d) +# 获取最新的 Git commit 哈希值的前 7 位 +COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) + +# 构建 full 版本的镜像 +docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . +# 为同一个镜像添加带日期的标签 +docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE +# 为同一个镜像添加带当前代码库Commit哈希值的标签 +docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH + + +# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) +docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . +# 为同一个镜像添加带日期的标签 +docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite +# 为同一个镜像添加带当前代码库Commit哈希值的标签 +docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..cd1d50d0ef4caec723f651c84f616ae8cc0b8f50 --- /dev/null +++ b/docs/cn/Changelog_CN.md @@ -0,0 +1,302 @@ +### 20240121更新 + +1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网 + +2-WebUI添加英文系统英文翻译适配 + +3-cmd-asr自动判断是否已自带damo模型, 如不在默认目录上将从modelscope自带下载 + +4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) + +5-清理TEMP文件夹缓存音频等文件 + +6-大幅削弱合成音频包含参考音频结尾的问题 + +### 20240122更新 + +1-修复过短输出文件返回重复参考音频的问题. + +2-经测试, 英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符). + +3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误. 
+ +### 20240123更新 + +1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题 + +2-支持推理界面快速切换模型 + +3-优化模型文件排序逻辑 + +4-中文分词使用jieba_fast代替jieba + +### 20240126更新 + +1-支持输出文本中英混合、日英混合 + +2-输出可选切分模式 + +3-修复uvr5读取到目录自动跳出的问题 + +4-修复多个换行导致推理报错 + +5-去除推理界面大量冗余log + +6-支持mac训练推理 + +7-自动识别不支持半精度的卡强制单精度.cpu推理下强制单精度. + +### 20240128更新 + +1-修复数字转汉字念法问题 + +2-修复句首少量字容易吞字的问题 + +3-通过限制排除不合理的参考音频长度 + +4-修复GPT训练不保存ckpt的问题 + +5-完善Dockerfile的下载模型流程 + +### 20240129更新 + +1-16系等半精度训练有问题的显卡把训练配置改为单精度训练 + +2-测试更新可用的colab版本 + +3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题 + + +### 20240130更新 + +1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错 + +2-修复中英文标点切割问题和句首句尾补标点的问题 + +3-增加按标点符号切分 + +### 20240201更新 + +1-修复uvr5读取格式错误导致分离失败的问题 + +2-支持中日英混合多种文本自动切分识别语种 + +### 20240202更新 + +1-修复asr路径尾缀带/保存文件名报错 + +2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 + +### 20240207更新 + +1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391 + +2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403 + +3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理, 16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) + +4-优化英文文本前端 + +5-修复gradio依赖 + +6-支持三连根目录留空自动读取.list全路径 + +7-集成faster whisper ASR日文英文 + +### 20240208更新 + +1-GPT训练卡死 (win10 1909) 和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b). + +### 20240212更新 + +1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题. + +2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457 + +### 20240214更新 + +1-训练支持中文实验名 (原来会报错) + +2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题. + +### 20240216更新 + +1-支持无参考文本输入 + +2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475 + +### 20240221更新 + +1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦). 
+ +2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509 + +3-mac CPU推理更快因此把推理设备从mps改到CPU + +4-colab修复不开启公网url + +### 20240306更新 + +1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672 + +2-如果用faster whisper非中文ASR不再需要先下中文funasr模型 + +3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610 + +4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675 + +5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573 + +### 202403/202404/202405更新 + +2个重点 + +1-修复sovits训练未冻结vq的问题 (可能造成效果下降) + +2-增加一个快速推理分支 + +以下都是小修补 + +1-修复无参考文本模式问题 + +2-优化中英文文本前端 + +3-api格式优化 + +4-cmd格式问题修复 + +5-训练数据处理阶段不支持的语言提示报错 + +6-nan自动转fp32阶段的hubert提取bug修复 + +### 20240610 + +小问题修复: + +1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169 + +2-uvr5中的mdxnet去混响cmd格式修复, 兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) + +3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159 + +大问题修复: + +4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) + +### 20240706 + +小问题修复: + +1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) + +2-修复降噪、asr中途遇到异常跳出所有需处理的音频文件的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1258 https://github.com/RVC-Boss/GPT-SoVITS/pull/1265 https://github.com/RVC-Boss/GPT-SoVITS/pull/1267 + +3-修复按标点符号切分时小数会被切分 https://github.com/RVC-Boss/GPT-SoVITS/pull/1253 + +4-[多卡训练多进程保存逻辑修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) + +5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251 + +重点: + +6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速. + +后面会逐渐验证快速推理分支的推理改动的一致性 + +### 20240727 + +1-清理冗余i18n代码 https://github.com/RVC-Boss/GPT-SoVITS/pull/1298 + +2-修复用户打文件及路径在结尾添加/会导致命令行报错的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1299 + +3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756 + +重点: + +4-[支持合成语速调节.支持冻结随机性只调节语速, ](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2)并将其更新到api.py上https://github.com/RVC-Boss/GPT-SoVITS/pull/1340 + + +### 20240806 + +1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) + +2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). 
https://github.com/RVC-Boss/GPT-SoVITS/pull/488 + +3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355 + +4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) + +5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) + +6-GPT-SoVITS-v2支持 + +7-计时逻辑优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/1387 + +### 20240821 + +1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490 + +2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508 + +3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503 + +4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422 + +5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422 + +### 20250211 + +增加gpt-sovits-v3模型, 需要14G显存可以微调 + +### 20250212 + +sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040 + +### 20250214 + +优化多语种混合文本切分策略a https://github.com/RVC-Boss/GPT-SoVITS/pull/2047 + +### 20250217 + +优化文本里的数字和英文处理逻辑https://github.com/RVC-Boss/GPT-SoVITS/pull/2062 + +### 20250218 + +优化多语种混合文本切分策略b https://github.com/RVC-Boss/GPT-SoVITS/pull/2073 + +### 20250223 + +1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好 + +2-人声背景音分离增加mel band roformer模型支持https://github.com/RVC-Boss/GPT-SoVITS/pull/2078 + +### 20250226 + +https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114 + +修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错) + +### 20250227 + +针对v3生成24k音频感觉闷的问题https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117 ,支持使用24k to 48k的音频超分模型缓解. + + +### 20250228 + +修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122 + +修复v3sovits未传参以支持调节语速 + +### 202503 + +修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8 + +修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +修复其他若干bug + +重点更新: + +1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理 diff --git a/docs/cn/README.md b/docs/cn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cc72b893eef3f44fb6e564e354dc0e66ba623fb4 --- /dev/null +++ b/docs/cn/README.md @@ -0,0 +1,395 @@ +
+<div align="center">
+
+<h1>GPT-SoVITS-WebUI</h1>
+强大的少样本语音转换与语音合成Web用户界面.<br><br>
+ +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) + +RVC-Boss%2FGPT-SoVITS | Trendshift + + + +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) +[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) +[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) + +[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md) + +
+ +--- + +## 功能: + +1. **零样本文本到语音 (TTS): ** 输入 5 秒的声音样本, 即刻体验文本到语音转换. + +2. **少样本 TTS: ** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感. + +3. **跨语言支持: ** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文. + +4. **WebUI 工具: ** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型. + +**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)** + +未见过的说话者 few-shot 微调演示: + +https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + +**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + +## 安装 + +中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验. + +### 测试通过的环境 + +| Python Version | PyTorch Version | Device | +|----------------|------------------|-----------------| +| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | +| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | +| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.6.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | + +### Windows + +如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI. + +**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).** + +### Linux + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### macOS + +**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.** + +1. 运行 `xcode-select --install` 安装 Xcode command-line tools. +2. 运行以下的命令来安装本项目: + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### 手动安装 + +#### 安装 FFmpeg + +##### Conda 用户 + +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian 用户 + +```bash +sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' +``` + +##### Windows 用户 + +下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下. + +安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS) + +##### MacOS 用户 + +```bash +brew install ffmpeg +``` + +#### 安装依赖 + +```bash +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +### 在 Docker 中使用 + +#### docker-compose.yaml 设置 + +0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建. +1. 环境变量: + +- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False. + +2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容. +3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置. +4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置. 
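Once the container is up (see the docker compose / docker run commands in the next subsections), the TTS API service included in this diff listens on port 9880 by default (the `api_port` in `config.py`, and the first port mapped above). The snippet below is a minimal, unofficial sketch of calling its `/tts` endpoint from the host with the `requests` package; the field names follow the request dict documented in the API source earlier in this diff, and the reference-audio path and transcript are placeholders that must point to files the server can actually read.

```python
# Unofficial sketch: POST a synthesis request to the /tts endpoint on port 9880.
# Field names follow the request dict documented in the API source; paths are placeholders.
import requests

payload = {
    "text": "你好, 这是一条测试语音.",                   # text to synthesize
    "text_lang": "zh",                                   # language of the text
    "ref_audio_path": "/workspace/reference/ref.wav",    # reference audio readable by the server
    "prompt_text": "参考音频对应的文本.",                 # transcript of the reference audio
    "prompt_lang": "zh",
    "text_split_method": "cut5",
    "batch_size": 1,
    "media_type": "wav",
    "streaming_mode": False,
}

resp = requests.post("http://127.0.0.1:9880/tts", json=payload, timeout=600)
resp.raise_for_status()          # errors come back as HTTP 400 with a JSON message
with open("output.wav", "wb") as f:
    f.write(resp.content)        # a non-streaming request returns the whole WAV body at once
```

A failed request is returned as HTTP 400 with a JSON error message; a successful non-streaming request returns the complete audio body in the requested `media_type`.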
+ +#### 通过 docker compose 运行 + +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### 通过 docker 命令运行 + +同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令: + +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +``` + +## 预训练模型 + +**若成功运行`install.sh`可跳过 No.1,2,3** + +**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).** + +1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中. + +2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS) + +3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中. + + - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型. + + - 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer`、`bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对. + +4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中. + +5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间. + +## 数据集格式 + +文本到语音 (TTS) 注释 .list 文件格式: + +``` +vocal_path|speaker_name|language|text +``` + +语言字典: + +- 'zh': 中文 +- 'ja': 日语 +- 'en': 英语 +- 'ko': 韩语 +- 'yue': 粤语 + +示例: + +``` +D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神. +``` + +## 微调与推理 + +### 打开 WebUI + +#### 整合包用户 + +双击`go-webui.bat`或者使用`go-webui.ps1` +若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1` + +#### 其他 + +```bash +python webui.py +``` + +若想使用 V1,则 + +```bash +python webui.py v1 +``` + +或者在 webUI 内动态切换 + +### 微调 + +#### 现已支持自动填充路径 + + 1. 填入训练音频路径 + 2. 切割音频 + 3. 进行降噪(可选) + 4. 进行ASR + 5. 校对标注 + 6. 前往下一个窗口,点击训练 + +### 打开推理 WebUI + +#### 整合包用户 + +双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI + +#### 其他 + +```bash +python GPT_SoVITS/inference_webui.py +``` + +或者 + +```bash +python webui.py +``` + +然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI + +## V2 发布说明 + +新特性: + +1. 支持韩语及粤语 + +2. 更好的文本前端 + +3. 底模由 2k 小时扩展至 5k 小时 + +4. 对低音质参考音频 (尤其是来源于网络的高频严重缺失、听着很闷的音频) 合成出来音质更好 + + 详见[wiki]() + +从 v1 环境迁移至 v2 + +1. 需要 pip 安装 requirements.txt 更新环境 + +2. 需要克隆 github 上的最新代码 + +3. 
需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下 + + 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) + +## V3 更新说明 + +新模型特点: + +1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大) + +2. GPT 合成更稳定, 重复漏字更少, 也更容易跑出丰富情感 + + 详见[wiki]() + +从 v2 环境迁移至 v3 + +1. 需要 pip 安装 requirements.txt 更新环境 + +2. 需要克隆 github 上的最新代码 + +3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下 + + 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt) + +## 待办事项清单 + +- [x] **高优先级: ** + + - [x] 日语和英语的本地化. + - [x] 用户指南. + - [x] 日语和英语数据集微调训练. + +- [ ] **功能:** + - [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟). + - [x] TTS 语速控制. + - [ ] ~~增强的 TTS 情感控制.~~ + - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布. + - [x] 改进英语和日语文本前端. + - [ ] 开发体积小和更大的 TTS 模型. + - [x] Colab 脚本. + - [x] 扩展训练数据集 (从 2k 小时到 10k 小时). + - [x] 更好的 sovits 基础模型 (增强的音频质量). + - [ ] 模型混合. + +## (附加) 命令行运行方式 + +使用命令行打开 UVR5 的 WebUI + +``` +python tools/uvr5/webui.py "" +``` + + + +这是使用命令行完成数据集的音频切分的方式 + +``` +python audio_slicer.py \ + --input_path "" \ + --output_root "" \ + --threshold \ + --min_length \ + --min_interval + --hop_size +``` + +这是使用命令行完成数据集 ASR 处理的方式 (仅限中文) + +``` +python tools/asr/funasr_asr.py -i -o +``` + +通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记) + + (没有进度条, GPU 性能可能会导致时间延迟) + +``` +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p +``` + +启用自定义列表保存路径 + +## 致谢 + +特别感谢以下项目和贡献者: + +### 理论研究 + +- [ar-vits](https://github.com/innnky/ar-vits) +- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) +- [vits](https://github.com/jaywalnut310/vits) +- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) +- [contentvec](https://github.com/auspicious3000/contentvec/) +- [hifi-gan](https://github.com/jik876/hifi-gan) +- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) +- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) +- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) + +### 预训练模型 + +- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) +- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) +- [BigVGAN](https://github.com/NVIDIA/BigVGAN) + +### 推理用文本前端 + +- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) +- [split-lang](https://github.com/DoodleBears/split-lang) +- [g2pW](https://github.com/GitYCC/g2pW) +- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) +- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) + +### WebUI 工具 + +- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) +- [audio-slicer](https://github.com/openvpi/audio-slicer) +- [SubFix](https://github.com/cronrpc/SubFix) +- [FFmpeg](https://github.com/FFmpeg/FFmpeg) +- [gradio](https://github.com/gradio-app/gradio) +- 
[faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [AP-BWE](https://github.com/yxlu-0102/AP-BWE) + +感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导. + +## 感谢所有贡献者的努力 + + + + diff --git a/docs/en/Changelog_EN.md b/docs/en/Changelog_EN.md new file mode 100644 index 0000000000000000000000000000000000000000..3c3fe18a975c3f552d83fc6f5d79843ad92aff66 --- /dev/null +++ b/docs/en/Changelog_EN.md @@ -0,0 +1,222 @@ +### 20240121 Update + +1. Added `is_share` to the `config`. In scenarios like Colab, this can be set to `True` to map the WebUI to the public network. +2. Added English system translation support to WebUI. +3. The `cmd-asr` automatically detects if the FunASR model is included; if not found in the default directory, it will be downloaded from ModelScope. +4. Attempted to fix the SoVITS training ZeroDivisionError reported in [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) by filtering samples with zero length, etc. +5. Cleaned up cached audio files and other files in the `TEMP` folder. +6. Significantly reduced the issue of synthesized audio containing the end of the reference audio. + +### 20240122 Update + +1. Fixed the issue where excessively short output files resulted in repeating the reference audio. +2. Tested native support for English and Japanese training (Japanese training requires the root directory to be free of non-English special characters). +3. Improved audio path checking. If an attempt is made to read from an incorrect input path, it will report that the path does not exist instead of an ffmpeg error. + +### 20240123 Update + +1. Resolved the issue where Hubert extraction caused NaN errors, leading to SoVITS/GPT training ZeroDivisionError. +2. Added support for quick model switching in the inference WebUI. +3. Optimized the model file sorting logic. +4. Replaced `jieba` with `jieba_fast` for Chinese word segmentation. + +### 20240126 Update + +1. Added support for Chinese-English mixed and Japanese-English mixed output texts. +2. Added an optional segmentation mode for output. +3. Fixed the issue of UVR5 reading and automatically jumping out of directories. +4. Fixed multiple newline issues causing inference errors. +5. Removed redundant logs in the inference WebUI. +6. Supported training and inference on Mac. +7. Automatically forced single precision for GPU that do not support half precision; enforced single precision under CPU inference. + +### 20240128 Update + +1. Fixed the issue with the pronunciation of numbers converting to Chinese characters. +2. Fixed the issue of swallowing a few characters at the beginning of sentences. +3. Excluded unreasonable reference audio lengths by setting restrictions. +4. Fixed the issue where GPT training did not save checkpoints. +5. Completed model downloading process in the Dockerfile. + +### 20240129 Update + +1. Changed training configurations to single precision for GPUs like the 16 series, which have issues with half precision training. +2. Tested and updated the available Colab version. +3. Fixed the issue of git cloning the ModelScope FunASR repository with older versions of FunASR causing interface misalignment errors. + +### 20240130 Update + +1. Automatically removed double quotes from all path-related entries to prevent errors from novice users copying paths with double quotes. +2. Fixed issues with splitting Chinese and English punctuation and added punctuation at the beginning and end of sentences. +3. Added splitting by punctuation. 
+ +### 20240201 Update + +1. Fixed the UVR5 format reading error causing separation failures. +2. Supported automatic segmentation and language recognition for mixed Chinese-Japanese-English texts. + +### 20240202 Update + +1. Fixed the issue where an ASR path ending with `/` caused an error in saving the filename. +2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) introduced PaddleSpeech's Normalizer to fix issues like reading "xx.xx%" (percent symbols) and "元/吨" being read as "元吨" instead of "元每吨", and fixed underscore errors. + +### 20240207 Update + +1. Corrected language parameter confusion causing decreased Chinese inference quality reported in [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391). +2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) adapted UVR5 to higher versions of librosa. +3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) fixed UVR5 inf everywhere error caused by `is_half` parameter not converting to boolean, resulting in constant half precision inference, which caused `inf` on 16 series GPUs. +4. Optimized English text frontend. +5. Fixed Gradio dependencies. +6. Supported automatic reading of `.list` full paths if the root directory is left blank during dataset preparation. +7. Integrated Faster Whisper ASR for Japanese and English. + +### 20240208 Update + +1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) attempted to fix GPT training hang on Windows 10 1909 and [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (Traditional Chinese System Language). + +### 20240212 Update + +1. Optimized logic for Faster Whisper and FunASR, switching Faster Whisper to mirror downloads to avoid issues with Hugging Face connections. +2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) enabled experimental DPO Loss training option to mitigate GPT repetition and missing characters by constructing negative samples during training and made several inference parameters available in the inference WebUI. + +### 20240214 Update + +1. Supported Chinese experiment names in training (previously caused errors). +2. Made DPO training an optional feature instead of mandatory. If selected, the batch size is automatically halved. Fixed issues with new parameters not being passed in the inference WebUI. + +### 20240216 Update + +1. Supported input without reference text. +2. Fixed bugs in Chinese frontend reported in [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475). + +### 20240221 Update + +1. Added a noise reduction option during data processing (noise reduction leaves only 16kHz sampling rate; use only if the background noise is significant). +2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) optimized Chinese and Japanese frontend processing. +3. Switched Mac CPU inference to use CPU instead of MPS for faster performance. +4. Fixed Colab public URL issue. + +### 20240306 Update + +1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) accelerated inference by 50% (tested on RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39) . +2. No longer requires downloading the Chinese FunASR model first when using Faster Whisper non-Chinese ASR. +3. 
[PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) fixed UVR5 reverb removal model where the setting was reversed. +4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) enabled automatic CPU inference for Faster Whisper if no CUDA is available. +5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) modified `is_half` check to ensure proper CPU inference on Mac. + +### 202403/202404/202405 Update + +#### Minor Fixes: + +1. Fixed issues with the no-reference text mode. +2. Optimized the Chinese and English text frontend. +3. Improved API format. +4. Fixed CMD format issues. +5. Added error prompts for unsupported languages during training data processing. +6. Fixed the bug in Hubert extraction. + +#### Major Fixes: + +1. Fixed the issue of SoVITS training without freezing VQ (which could cause quality degradation). +2. Added a quick inference branch. + +### 20240610 Update + +#### Minor Fixes: + +1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) improved the logic for pure punctuation and multi-punctuation text input. +2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) fixed CMD format for MDXNet de-reverb in UVR5, supporting paths with spaces. +3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) fixed progress bar logic for SoVITS training in `s2_train.py`. + +#### Major Fixes: + +4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation. + **Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.** + +### 20240706 Update + +#### Minor Fixes: + +1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) fixed default batch size decimal issue in CPU inference. +2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) fixed issues where denoising or ASR encountering exceptions would exit all pending audio files. +3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) fixed the issue of splitting decimals when splitting by punctuation. +4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) fixed multi-process save logic for multi-GPU training. +5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) removed redundant `my_utils`. + +#### Major Fixes: + +6. The accelerated inference code from [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) has been validated and merged into the main branch, ensuring consistent inference effects with the base. + It also supports accelerated inference in no-reference text mode. + +**Future updates will continue to verify the consistency of changes in the `fast_inference` branch**. + +### 20240727 Update + +#### Minor Fixes: + +1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) cleaned up redundant i18n code. +2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) fixed issues where trailing slashes in user file paths caused command line errors. +3. 
[PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) fixed the step calculation logic in GPT training. + +#### Major Fixes: + +4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) supported speech rate adjustment for synthesis. + Enabled freezing randomness while only adjusting the speech rate. + +### 20240806 Update + +1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) Added support for the BS RoFormer vocal accompaniment separation model. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) Enabled FP16 inference. +2. Improved Chinese text frontend. + - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) added support for polyphonic characters (v2 only); + - [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) added quantifier; + - [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) supports arithmetic and basic math formulas; + - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) fixed mixed text errors. +3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) automatically filled in the paths when processing audio in the WebUI. +4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) optimized GPU recognition logic. +5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) added support for Cantonese ASR. +6. Added support for GPT-SoVITS v2. +7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) optimized timing logic. + +### 20240821 Update + +1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) Merge the `fast_inference` branch into the main branch. +2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) Support for optimizing numbers, phone numbers, dates, and times using SSML tags. +3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) Fixed and optimized API. +4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Fixed the bug where only one reference audio could be uploaded for mixing, Added various dataset checks with warnings popping up if missing files. + +### 20250211 Update + +- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) Added GPT-SoVITS v3 Model, Need 14GB GPU Memory to Fine-tune SoVITS v3. + +### 20250212 Update + +- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) Added gradient checkpointing to Fine-tune SoVITS v3, Need 12GB GPU Memory. + +### 20250214 Update + +- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Optimize the multilingual mixed text segmentation strategy **A**. + -AAdded `split-lang` as a language segmentation tool to improve segmentation capabilities for multi-language mixed text. + +### 20250217 Update + +- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Optimize the logic for handling numbers and English in the text. + +### 20250218 Update + +- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Optimize the multilingual mixed text segmentation strategy **B**. + +### 20250223 Update + +1. LoRA training is supported for fine-tuning with SoVITS V3. It requires 8GB GPU Memory and the results are better than full parameter fine-tuning. +2. 
[PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Added Mel Band RoFormer model for Vocal & Instrument Separation. + +### 20250226 Update + +1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Fix issues caused by non-English directories in Windows. + - Using `langsegmenter` for Korean. +2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Fix issues caused by non-English directories in Windows. + - Using `langsegmenter` for Korean/Japanese. + +### 20250227 Update + +- Added 24K to 48K audio super-resolution models to alleviate the muffled issue when generating 24K audio with V3 model, as reported in [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117). \ No newline at end of file diff --git a/docs/ja/Changelog_JA.md b/docs/ja/Changelog_JA.md new file mode 100644 index 0000000000000000000000000000000000000000..1173c081583fd520b9ef317eeec406c0e9033c35 --- /dev/null +++ b/docs/ja/Changelog_JA.md @@ -0,0 +1,221 @@ +### 20240121 更新 + +1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます. +2. WebUI に英語システムの英語翻訳を追加しました. +3. `cmd-asr`は FunASR モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします. +4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます (長さ 0 のサンプルをフィルタリングなど) +5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します. +6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました. + +### 20240122 更新 + +1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました. +2. 英語-日本語学習がスムーズに進む QA を完了しました. (ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) +3. オーディオパスをチェックします.間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます.これは ffmpeg モジュールのエラーではありません. + +### 20240123 更新 + +1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました. +2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました. +3. モデルファイルのソートロジックを最適化しました. +4. 中国語の分析に `jieba_fast` を `jieba` に置き換えました. + +### 20240126 更新 + +1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします. +2. 出力で選択的な分割モードをサポートします. +3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました. +4. 複数の改行による推論エラーを修正しました. +5. 推論インターフェースから不要なログを削除しました. +6. MacOS での学習と推論をサポートします. +7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します. + +### 20240128 更新 + +1. 数字を漢字で読む問題を修正しました. +2. 文章の先頭の一部の単語が欠落する問題を修正しました. +3. 不適切な長さのリファレンスオーディオを制限しました. +4. GPT 学習時の ckpt が保存されない問題を修正しました. +5. Dockerfile のモデルダウンロードプロセスを改善しました. + +### 20240129 更新 + +1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました. +2. Colab でも使用可能なバージョンをテストして更新しました. +3. ModelScope FunASR リポジトリの古いバージョンで git クローンを行う際のインターフェース不整合エラーの問題を修正しました. + +### 20240130 更新 + +1. パスと関連する文字列を解析して、二重引用符を自動的に削除します.また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません. +2. 中国語と英語、日本語と英語の混合出力をサポートします. +3. 出力で選択的な分割モードをサポートします. + +### 20240201 更新 + +1. UVR5 形式の読み取りエラーによる分離失敗を修正しました. +2. 中国語・日本語・英語の混合テキストに対する自動分割と言語認識をサポートしました. + +### 20240202 更新 + +1. ASRパスが `/` で終わることによるファイル名保存エラーの問題を修正しました. +2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) で PaddleSpeech の Normalizer を導入し、"xx.xx%" (パーセント記号) の読み取りや"元/吨"が"元吨"ではなく"元每吨"と読まれる問題、アンダースコアエラーを修正しました. + +### 20240207 更新 + +1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) で報告された中国語推論品質の低下を引き起こした言語パラメータの混乱を修正しました. +2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) で UVR5 を librosa のより高いバージョンに適応させました. +3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) で、`is_half` パラメータがブール値に変換されず、常に半精度推論が行われ、16 シリーズの GPU で `inf` が発生する UVR5 inf everywhereエラーを修正しました. +4. 英語テキストフロントエンドを最適化しました. +5. Gradio の依存関係を修正しました. +6. 
データセット準備中にルートディレクトリが空白の場合、`.list` フルパスの自動読み取りをサポートしました. +7. 日本語と英語のために Faster Whisper ASR を統合しました. + +### 20240208 更新 + +1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) で、Windows 10 1909 および [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (繁体字中国語システム言語) での GPT トレーニングのハングを修正する試みを行いました. + +### 20240212 更新 + +1. Faster Whisper と FunASR のロジックを最適化し、Faster Whisper をミラーダウンロードに切り替えて Hugging Face の接続問題を回避しました. +2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) で、GPT の繰り返しと文字欠落を軽減するために、トレーニング中に負のサンプルを構築する実験的なDPO Lossトレーニングオプションを有効にし、いくつかの推論パラメータを推論WebUIで利用可能にしました. + +### 20240214 更新 + +1. トレーニングで中国語の実験名をサポート (以前はエラーが発生していました). +2. DPOトレーニングを必須ではなくオプション機能に変更.選択された場合、バッチサイズは自動的に半分になります.推論 WebUI で新しいパラメータが渡されない問題を修正しました. + +### 20240216 更新 + +1. 参照テキストなしでの入力をサポート. +2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) で報告された中国語フロントエンドのバグを修正しました. + +### 20240221 更新 + +1. データ処理中のノイズ低減オプションを追加 (ノイズ低減は16kHzサンプリングレートのみを残します;背景ノイズが大きい場合にのみ使用してください). +2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) で中国語と日本語のフロントエンド処理を最適化しました. +3. Mac CPU 推論を MPS ではなく CPU を使用するように切り替え、パフォーマンスを向上させました. +4. Colab のパブリック URL の問題を修正しました. +### 20240306 更新 + +1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) で推論速度を50%向上させました (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 でテスト). +2. Faster Whisper非中国語ASRを使用する際、最初に中国語FunASRモデルをダウンロードする必要がなくなりました. +3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) で UVR5 残響除去モデルの設定が逆になっていた問題を修正しました. +4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) で、CUDA が利用できない場合に Faster Whisper の自動 CPU 推論を有効にしました. +5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) で、Mac での適切なCPU推論を確保するために `is_half` チェックを修正しました. + +### 202403/202404/202405 更新 + +#### マイナー修正: + +1. 参照テキストなしモードの問題を修正しました. +2. 中国語と英語のテキストフロントエンドを最適化しました. +3. API フォーマットを改善しました. +4. CMD フォーマットの問題を修正しました. +5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました. +6. Hubert 抽出のバグを修正しました. + +#### メジャー修正: + +1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました. +2. クイック推論ブランチを追加しました. + +### 20240610 更新 + +#### マイナー修正: + +1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました. +2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました. +3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました. + +#### メジャー修正: + +4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました. + **注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします.** + +### 20240706 更新 + +#### マイナー修正: + +1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました. +2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、ノイズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました. +3. 
[PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) で、句読点で分割する際の小数点分割の問題を修正しました. +4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました. +5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました. + +#### メジャー修正: + +6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました. + また、参照テキストなしモードでの加速推論もサポートしています. + +**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**. + +### 20240727 更新 + +#### マイナー修正: + +1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました. +2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました. +3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました. + +#### メジャー修正: + +4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました. + スピーチレートのみを調整しながらランダム性を固定できるようになりました. + +### 20240806 更新 + +1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカルアコムパニ分離モデルのサポートを追加しました.[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました. +2. 中国語テキストフロントエンドを改善しました. + - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加 (v2 のみ); + - [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 量詞を追加; + - [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 四則演算と基本数式のサポート; + - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正. +3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUIでオーディオ処理時にパスを自動入力しました. +4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました. +5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました. +6. GPT-SoVITS v2 のサポートを追加しました. +7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました. + +### 20240821 更新 + +1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました. +2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました. +3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました. +4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました. + +### 20250211 更新 + +1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました.SoVITS v3のファインチューニングには14GBのGPUメモリが必要です. + +### 20250212 更新 + +- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です. + +### 20250214 更新 + +- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**. + - `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました. + +### 20250217 更新 + +- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化. + +### 20250218 更新 + +- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**. + +### 20250223 更新 + +1. 
LoRAトレーニングがSoVITS V3のファインチューニングに対応しました.8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています. +2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました. + +### 20250226 更新 + +1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました. + - `langsegmenter`を使用して韓国語の問題を修正. +2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました. + - `langsegmenter`を使用して韓国語/日本語の問題を修正. + +### 20250227 更新 + +- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました.[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています. \ No newline at end of file diff --git a/docs/ja/README.md b/docs/ja/README.md new file mode 100644 index 0000000000000000000000000000000000000000..145de219dfebd426014883f384dbf053326ab72c --- /dev/null +++ b/docs/ja/README.md @@ -0,0 +1,383 @@ +
+<div align="center">
+
+<h1>GPT-SoVITS-WebUI</h1>
+パワフルなFew-Shot音声変換・音声合成 WebUI.<br><br>
+ +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) + +
+ +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) +[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) +[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) + +[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md) + +
+ +--- + +## 機能: + +1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます. + +2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上. + +3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています. + +4. **WebUI ツール:** 統合されたツールは、音声と伴奏 (BGM 等) の分離、トレーニングセットの自動セグメンテーション、ASR (中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます. + +**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!** + +声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ: + +https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + +**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + +## インストール + +### テスト済みの環境 + +| Python Version | PyTorch Version | Device | +|----------------|------------------|-----------------| +| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | +| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | +| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.6.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | + +### Windows + +Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します. + +### Linux + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### macOS + +**注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.** + +1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします. +2. 以下のコマンドを実行してこのプロジェクトをインストールします. + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### 手動インストール + +#### FFmpeg をインストールします. + +##### Conda ユーザー + +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian ユーザー + +```bash +sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' +``` + +##### Windows ユーザー + +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます. + +##### MacOS ユーザー + +```bash +brew install ffmpeg +``` + +#### 依存関係をインストールします + +```bash +pip install -r extra-req.txt --no-deps +pip install -r requirementx.txt +``` + +### Docker の使用 + +#### docker-compose.yaml の設定 + +0. イメージのタグについて: コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(古いバージョン) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください. +1. 環境変数: + + - `is_half`: 半精度/倍精度の制御."SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です.実際の状況に応じて True または False に調整してください. + +2. ボリューム設定: コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます.デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています. +3. `shm_size`: Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります.状況に応じて適宜設定してください. +4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください. 
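As a complementary, equally unofficial sketch (again assuming the API service from this diff is reachable on the mapped port 9880): the example below switches the pipeline to fine-tuned checkpoints via `/set_gpt_weights` and `/set_sovits_weights`, then requests streamed audio from the GET `/tts` endpoint — with `streaming_mode` enabled the server sends a WAV header first and raw audio chunks afterwards. Every checkpoint name and file path here is a placeholder.

```python
# Unofficial sketch: switch model weights, then stream audio from the GET /tts endpoint.
# Endpoint names and query parameters follow the API source earlier in this diff;
# checkpoint names and file paths are placeholders.
import requests

BASE = "http://127.0.0.1:9880"

# Point the pipeline at fine-tuned checkpoints (paths must be visible to the server/container).
requests.get(f"{BASE}/set_gpt_weights", params={"weights_path": "GPT_weights/my_voice.ckpt"}).raise_for_status()
requests.get(f"{BASE}/set_sovits_weights", params={"weights_path": "SoVITS_weights/my_voice.pth"}).raise_for_status()

params = {
    "text": "これはストリーミング合成のテストです.",
    "text_lang": "ja",
    "ref_audio_path": "/workspace/reference/ref.wav",   # reference audio readable by the server
    "prompt_text": "参照音声の書き起こし.",
    "prompt_lang": "ja",
    "media_type": "wav",
    "streaming_mode": True,   # WAV header first, then raw audio chunks
}

with requests.get(f"{BASE}/tts", params=params, stream=True, timeout=600) as resp:
    resp.raise_for_status()
    with open("streamed.wav", "wb") as out:
        for chunk in resp.iter_content(chunk_size=None):
            out.write(chunk)
```

See the API docstring earlier in this diff for the full list of supported parameters and media types.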
+ +#### docker compose で実行する + +```markdown +docker compose -f "docker-compose.yaml" up -d +``` + +#### docker コマンドで実行する + +上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します: + +```markdown +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +``` + +## 事前訓練済みモデル + +**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.** + +1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください. + +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ) + +3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください. + + - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます. + + - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_mand_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです. + +4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください. + +5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります. + +## データセット形式 + +TTS アノテーション .list ファイル形式: + +``` +vocal_path|speaker_name|language|text +``` + +言語辞書: + +- 'zh': 中国語 +- 'ja': 日本語 +- 'en': 英語 + +例: + +``` +D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. +``` + +## 微調整と推論 + +### WebUI を開く + +#### 統合パッケージ利用者 + +`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します. +V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください. + +#### その他 + +```bash +python webui.py <言語(オプション)> +``` + +V1 に切り替えたい場合は + +```bash +python webui.py v1 <言語(オプション)> +``` + +または WebUI で手動でバージョンを切り替えてください. + +### 微調整 + +#### パス自動補完のサポート + + 1. 音声パスを入力する + 2. 音声を小さなチャンクに分割する + 3. ノイズ除去 (オプション) + 4. ASR + 5. ASR転写を校正する + 6. 次のタブに移動し、モデルを微調整する + +### 推論 WebUI を開く + +#### 統合パッケージ利用者 + +`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. + +#### その他 + +```bash +python GPT_SoVITS/inference_webui.py <言語(オプション)> +``` + +または + +```bash +python webui.py +``` + +その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. + +## V2 リリースノート + +新機能: + +1. 韓国語と広東語をサポート + +2. 最適化されたテキストフロントエンド + +3. 
事前学習済みモデルが 2 千時間から 5 千時間に拡張 + +4. 低品質の参照音声に対する合成品質の向上 + + [詳細はこちら]() + +V1 環境から V2 を使用するには: + +1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新 + +2. 最新のコードを github からクローン + +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置 + + 中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) + +## V3 リリースノート + +新機能: + +1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます). + +2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました. + + [詳細情報はこちら]() + +v2 環境から v3 を使用する方法: + +1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します. + +2. GitHub から最新のコードをクローンします. + +3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します. + + 追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください. + +## Todo リスト + +- [x] **優先度 高:** + + - [x] 日本語と英語でのローカライズ. + - [x] ユーザーガイド. + - [x] 日本語データセットと英語データセットのファインチューニングトレーニング. + +- [ ] **機能:** + - [x] ゼロショット音声変換 (5 秒) /数ショット音声変換 (1 分). + - [x] TTS スピーキングスピードコントロール. + - [ ] ~~TTS の感情コントロールの強化.~~ + - [ ] SoVITS トークン入力を語彙の確率分布に変更する実験. + - [x] 英語と日本語のテキストフロントエンドを改善. + - [ ] 小型と大型の TTS モデルを開発する. + - [x] Colab のスクリプト. + - [ ] トレーニングデータセットを拡張する (2k→10k). + - [x] より良い sovits ベースモデル (音質向上) + - [ ] モデルミックス + +## (追加の) コマンドラインから実行する方法 + +コマンド ラインを使用して UVR5 の WebUI を開きます + +``` +python tools/uvr5/webui.py "" +``` + + + +コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです. 
+ +``` +python audio_slicer.py \ + --input_path "" \ + --output_root "" \ + --threshold \ + --min_length \ + --min_interval + --hop_size +``` + +コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ) + +``` +python tools/asr/funasr_asr.py -i -o +``` + +ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング) + +(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります) + +``` +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p +``` + +カスタムリストの保存パスが有効になっています + +## クレジット + +特に以下のプロジェクトと貢献者に感謝します: + +### 理論研究 + +- [ar-vits](https://github.com/innnky/ar-vits) +- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) +- [vits](https://github.com/jaywalnut310/vits) +- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) +- [contentvec](https://github.com/auspicious3000/contentvec/) +- [hifi-gan](https://github.com/jik876/hifi-gan) +- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) +- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) +- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) + +### 事前学習モデル + +- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) +- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) +- [BigVGAN](https://github.com/NVIDIA/BigVGAN) + +### 推論用テキストフロントエンド + +- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) +- [split-lang](https://github.com/DoodleBears/split-lang) +- [g2pW](https://github.com/GitYCC/g2pW) +- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) +- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) + +### WebUI ツール + +- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) +- [audio-slicer](https://github.com/openvpi/audio-slicer) +- [SubFix](https://github.com/cronrpc/SubFix) +- [FFmpeg](https://github.com/FFmpeg/FFmpeg) +- [gradio](https://github.com/gradio-app/gradio) +- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [AP-BWE](https://github.com/yxlu-0102/AP-BWE) + +@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます. + +## すべてのコントリビューターに感謝します + + + + diff --git a/docs/ko/Changelog_KO.md b/docs/ko/Changelog_KO.md new file mode 100644 index 0000000000000000000000000000000000000000..e0cfc3b08b1f509bf1be1ee46eab40012bff53e3 --- /dev/null +++ b/docs/ko/Changelog_KO.md @@ -0,0 +1,222 @@ +### 20240121 업데이트 + +1. `config`에 `is_share`를 추가했습니다. Colab과 같은 시나리오에서는 이 값을 `True`로 설정하여 WebUI를 공개 네트워크에 매핑할 수 있습니다. +2. WebUI에 영어 시스템 번역 지원을 추가했습니다. +3. `cmd-asr`이 FunASR 모델이 포함되어 있는지 자동으로 감지합니다; 기본 디렉토리에서 찾을 수 없으면 ModelScope에서 다운로드됩니다. +4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)에서 보고된 SoVITS 훈련의 ZeroDivisionError를 필터링 샘플 등으로 해결하려고 시도했습니다. +5. `TEMP` 폴더의 캐시된 오디오 파일 및 기타 파일을 정리했습니다. +6. 참조 오디오의 끝이 포함된 합성 오디오 문제를 크게 줄였습니다. + +### 20240122 업데이트 + +1. 지나치게 짧은 출력 파일로 인해 참조 오디오가 반복되는 문제를 수정했습니다. +2. 영어 및 일본어 훈련의 네이티브 지원을 테스트했습니다 (일본어 훈련 시 루트 디렉토리에 비영어 특수 문자가 없어야 합니다). +3. 오디오 경로 확인을 개선했습니다. 잘못된 입력 경로에서 읽으려는 시도가 있을 경우, ffmpeg 오류 대신 경로가 존재하지 않는다고 보고합니다. + +### 20240123 업데이트 + +1. Hubert 추출로 인해 NaN 오류가 발생하여 SoVITS/GPT 훈련에서 ZeroDivisionError가 발생하는 문제를 해결했습니다. +2. 추론 WebUI에서 빠른 모델 전환 지원을 추가했습니다. +3. 모델 파일 정렬 로직을 최적화했습니다. +4. 중국어 단어 분할을 위해 `jieba`를 `jieba_fast`로 교체했습니다. 
+ +### 20240126 업데이트 + +1. 중국어-영어 혼합 및 일본어-영어 혼합 출력 텍스트를 지원합니다. +2. 출력에 대한 선택적 분할 모드를 추가했습니다. +3. UVR5 읽기 문제 및 디렉토리 자동 탈출 문제를 수정했습니다. +4. 추론 오류를 일으키는 여러 줄 바꿈 문제를 수정했습니다. +5. 추론 WebUI 에서 중복 로그를 제거했습니다. +6. Mac에서 훈련 및 추론을 지원합니다. +7. 절반 정밀도를 지원하지 않는 GPU에 대해 자동으로 단정밀도를 강제하며, CPU 추론 시 단정밀도를 적용합니다. + +### 20240128 업데이트 + +1. 숫자의 발음이 중국어 문자로 변환되는 문제를 수정했습니다. +2. 문장 시작 부분에서 몇 개의 문자가 누락되는 문제를 수정했습니다. +3. 비합리적인 참조 오디오 길이를 설정하여 제외했습니다. +4. GPT 훈련 시 체크포인트가 저장되지 않는 문제를 수정했습니다. +5. Dockerfile 에서 모델 다운로드 프로세스를 완료했습니다. + +### 20240129 업데이트 + +1. 절반 정밀도 훈련에 문제가 있는 16 시리즈와 같은 GPU의 훈련 구성을 단정밀도로 변경했습니다. +2. 사용 가능한 Colab 버전을 테스트하고 업데이트했습니다. +3. 이전 버전의 FunASR 로 인해 인터페이스 정렬 오류가 발생하는 ModelScope FunASR 저장소의 git 클로닝 문제를 수정했습니다. + +### 20240130 업데이트 + +1. 모든 경로 관련 항목에서 이중 따옴표를 자동으로 제거하여 초보자가 이중 따옴표가 포함된 경로를 복사하는 오류를 방지했습니다. +2. 중국어 및 영어 문장 부호 분할 문제를 수정하고 문장 시작과 끝에 부호를 추가했습니다. +3. 부호에 의한 분할을 추가했습니다. + +### 20240201 업데이트 + +1. 분리 실패를 일으킨 UVR5 형식 읽기 오류를 수정했습니다. +2. 혼합된 중국어-일본어-영어 텍스트에 대한 자동 분할 및 언어 인식을 지원합니다. + +### 20240202 업데이트 + +1. `/` 로 끝나는 ASR 경로가 파일 이름 저장 시 오류를 발생시키는 문제를 수정했습니다. +2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) 에서는 PaddleSpeech 의 Normalizer 를 도입하여 "xx.xx%" (백분율 기호)와 "元/吨"이 "元吨"으로 읽히는 문제를 "元每吨"으로 수정하고, 밑줄 오류를 수정했습니다. + +### 20240207 업데이트 + +1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) 에서 보고된 중국어 추론 품질 저하를 일으킨 언어 매개변수 혼동을 수정했습니다. +2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) 에서는 UVR5 를 높은 버전의 librosa에 맞게 조정했습니다. +3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)에서는 `is_half` 매개변수가 불리언으로 변환되지 않아 발생한 UVR5 `inf` 오류를 수정했습니다. 이로 인해 16 시리즈 GPU에서 `inf` 가 발생했습니다. +4. 영어 텍스트 프론트엔드를 최적화했습니다. +5. Gradio 종속성 문제를 수정했습니다. +6. 데이터셋 준비 시 루트 디렉토리를 비워두면 `.list` 전체 경로를 자동으로 읽도록 지원합니다. +7. 일본어와 영어에 대한 Faster Whisper ASR을 통합했습니다. + +### 20240208 업데이트 + +1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)에서는 Windows 10 1909와 [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (전통 중국어 시스템 언어)에서 GPT 훈련 멈춤 문제를 수정하려고 했습니다. + +### 20240212 업데이트 + +1. Faster Whisper와 FunASR의 로직을 최적화하고, Faster Whisper를 미러 다운로드로 전환하여 Hugging Face 연결 문제를 피했습니다. +2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)은 DPO Loss 실험적 훈련 옵션을 활성화하여 GPT의 반복 및 문자 누락 문제를 완화하고, 훈련 중 부정 샘플을 구성하며 여러 추론 매개변수를 추론 WebUI에서 사용할 수 있게 했습니다. + +### 20240214 업데이트 + +1. 훈련 시 중국어 실험 이름을 지원합니다 (이전에는 오류가 발생했습니다). +2. DPO 훈련을 필수 기능 대신 선택적 기능으로 변경했습니다. 선택 시, 배치 크기가 자동으로 절반으로 줄어듭니다. 추론 WebUI에서 새로운 매개변수가 전달되지 않는 문제를 수정했습니다. + +### 20240216 업데이트 + +1. 참조 텍스트 없이 입력을 지원합니다. +2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)에서 보고된 중국어 프론트엔드의 버그를 수정했습니다. + +### 20240221 업데이트 + +1. 데이터 처리 중 노이즈 감소 옵션을 추가했습니다 (노이즈 감소는 16kHz 샘플링 비율만 남깁니다; 배경 노이즈가 심한 경우에만 사용하십시오). +2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) 중국어 및 일본어 프론트엔드 처리를 최적화했습니다. +3. Mac CPU 추론을 MPS 대신 CPU를 사용하도록 전환하여 성능을 향상시켰습니다. +4. Colab 공개 URL 문제를 수정했습니다. + +### 20240306 업데이트 + +1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)는 추론 속도를 50% 가속화했습니다 (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39에서 테스트됨). +2. Faster Whisper의 비중국어 ASR을 사용할 때 중국어 FunASR 모델을 먼저 다운로드할 필요가 없습니다. +3. 
[PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)은 UVR5 리버브 제거 모델에서 설정이 반대로 되어 있는 문제를 수정했습니다. +4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)는 CUDA가 없는 경우 Faster Whisper의 자동 CPU 추론을 가능하게 했습니다. +5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)은 Mac에서 올바른 CPU 추론을 보장하기 위해 `is_half` 체크를 수정했습니다. + +### 202403/202404/202405 업데이트 + +#### 사소한 수정: + +1. 참조 텍스트 없는 모드의 문제를 수정했습니다. +2. 중국어 및 영어 텍스트 프론트엔드를 최적화했습니다. +3. API 형식을 개선했습니다. +4. CMD 형식 문제를 수정했습니다. +5. 훈련 데이터 처리 중 지원되지 않는 언어에 대한 오류 프롬프트를 추가했습니다. +6. Hubert 추출의 버그를 수정했습니다. + +#### 주요 수정: + +1. VQ를 고정하지 않고 SoVITS 훈련의 문제를 수정했습니다(품질 저하를 일으킬 수 있음). +2. 빠른 추론 분기를 추가했습니다. + +### 20240610 업데이트 + +#### 사소한 수정: + +1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) 순수 구두점 및 다중 구두점 텍스트 입력 로직을 개선했습니다. +2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5에서 MDXNet 디러버브를 위한 CMD 형식을 수정하고 공백이 있는 경로를 지원했습니다. +3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py`에서 SoVITS 훈련을 위한 진행률 표시줄 로직을 수정했습니다. + +#### 주요 수정: + +4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI의 GPT 미세 조정이 중국어 입력 텍스트의 BERT 기능을 읽지 않아 추론과 불일치 및 잠재적 품질 저하를 일으키는 문제를 수정했습니다. + **주의: 이전에 많은 양의 데이터로 미세 조정한 경우 품질을 향상시키기 위해 모델을 다시 조정하는 것이 좋습니다.** + +### 20240706 업데이트 + +#### 사소한 수정: + +1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU 추론에서 기본 배치 크기 소수점 문제를 수정했습니다. +2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) 노이즈 제거 또는 ASR이 예외를 만나면 모든 보류 중인 오디오 파일이 종료되는 문제를 수정했습니다. +3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) 구두점으로 분할할 때 소수점 분할 문제를 수정했습니다. +4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) 다중 GPU 훈련을 위한 다중 프로세스 저장 로직을 수정했습니다. +5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) 불필요한 `my_utils`를 제거했습니다. + +#### 주요 수정: + +6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)의 가속 추론 코드가 검증되어 메인 브랜치에 병합되었으며, 기본과 일관된 추론 효과를 보장합니다. + 또한 참조 텍스트 없는 모드에서 가속 추론을 지원합니다. + +**향후 업데이트에서는 `fast_inference` 브랜치의 변경 사항의 일관성을 계속 검증할 것입니다**. + +### 20240727 업데이트 + +#### 사소한 수정: + +1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) 불필요한 i18n 코드를 정리했습니다. +2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) 사용자 파일 경로의 후행 슬래시가 명령줄 오류를 일으키는 문제를 수정했습니다. +3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT 훈련의 단계 계산 로직을 수정했습니다. + +#### 주요 수정: + +4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) 합성을 위한 음성 속도 조절을 지원했습니다. + 음성 속도만 조절하면서 무작위성을 고정할 수 있습니다. + +### 20240806 업데이트 + +1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer 보컬 반주 분리 모델에 대한 지원을 추가했습니다. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 추론을 활성화했습니다. +2. 중국어 텍스트 프론트엔드를 개선했습니다. 
+ - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 중국어 다의자 지원 (v2 전용); + - [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 추가된 양자; + - [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 사칙연산 및 기본 수학 공식을 지원합니다; + - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 혼합 텍스트 오류를 수정했습니다. +3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI 에서 오디오를 처리할 때 경로를 자동으로 채웠습니다. +4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 인식 로직을 최적화했습니다. +5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 광동어 ASR 지원을 추가했습니다. +6. GPT-SoVITS v2 지원을 추가했습니다. +7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다. + +### 20240821 업데이트 + +1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` 브랜치를 메인 브랜치에 병합. +2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML 태그를 사용하여 숫자, 전화번호, 날짜 및 시간 최적화 지원. +3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API 수정 및 최적화. +4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 믹싱을 위한 참조 오디오를 하나만 업로드할 수 있는 버그 수정, 다양한 데이터셋 검사 추가 및 파일이 누락된 경우 경고 팝업. + +### 20250211 업데이트 + +- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 모델 추가, SoVITS v3의 파인튜닝에는 14GB GPU 메모리가 필요합니다. + +### 20250212 업데이트 + +- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3의 파인튜닝에 그라디언트 체크포인트 추가, 12GB GPU 메모리가 필요합니다. + +### 20250214 업데이트 + +- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 다국어 혼합 텍스트 분할 전략 **A** 최적화. + - `split-lang`을 언어 분할 도구로 추가하여 다국어 혼합 텍스트의 분할 능력을 향상시켰습니다. + +### 20250217 업데이트 + +- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) 텍스트 내 숫자와 영어 처리 로직 최적화. + +### 20250218 업데이트 + +- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 다국어 혼합 텍스트 분할 전략 **B** 최적화. + +### 20250223 업데이트 + +1. SoVITS V3의 파인튜닝에 LoRA 훈련이 지원됩니다. 8GB GPU 메모리가 필요하며, 전체 매개변수 파인튜닝보다 더 나은 결과를 제공합니다. +2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) 보컬 및 악기 분리를 위해 Mel Band RoFormer 모델 추가. + +### 20250226 업데이트 + +1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows에서 비영어 디렉토리로 인한 문제 수정. + - 한국어에 대한 `langsegmenter` 사용 문제 수정. +2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows에서 비영어 디렉토리로 인한 문제 수정. + - 한국어/일본어에 대한 `langsegmenter` 사용 문제 수정. + +### 20250227 업데이트 + +- V3 모델로 24K 오디오를 생성할 때 발생하는 음성 뭉침 문제를 완화하기 위해, 24K에서 48K로의 오디오 초해상도 모델을 추가했습니다. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)에서 보고된 문제입니다. \ No newline at end of file diff --git a/docs/ko/README.md b/docs/ko/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e9fee8fc99592ffb9c9bb47a1482a92ded64c093 --- /dev/null +++ b/docs/ko/README.md @@ -0,0 +1,389 @@ +
+
+<h1>GPT-SoVITS-WebUI</h1>
+소량의 데이터로 음성 변환 및 음성 합성을 지원하는 강력한 WebUI.<br><br>

+ +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) + +
+ +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) +[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) +[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) + +[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md) + +
+ +--- + +## 기능: + +1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다. + +2. **소량의 데이터 TTS:** 1분의 훈련 데이터만으로 모델을 미세 조정하여 음성 유사도와 실제감을 향상시킬 수 있습니다. + +3. **다국어 지원:** 훈련 데이터셋과 다른 언어의 추론을 지원하며, 현재 영어, 일본어, 중국어, 광둥어, 한국어를 지원합니다. + +4. **WebUI 도구:** 음성 반주 분리, 자동 훈련 데이터셋 분할, 중국어 자동 음성 인식(ASR) 및 텍스트 주석 등의 도구를 통합하여 초보자가 훈련 데이터셋과 GPT/SoVITS 모델을 생성하는 데 도움을 줍니다. + +**데모 비디오를 확인하세요! [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)** + +보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모: + +https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + +**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + +## 설치 + +### 테스트 통과 환경 + +| Python Version | PyTorch Version | Device | +|----------------|------------------|-----------------| +| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | +| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | +| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.6.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | + +### Windows + +Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다. + +### Linux + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### macOS + +**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.** + +1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요. +2. 다음 명령어를 실행하여 이 프로젝트를 설치하세요. + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### 수동 설치 + +#### FFmpeg 설치 + +##### Conda 사용자 + +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian 사용자 + +```bash +sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' +``` + +##### Windows 사용자 + +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다. + +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용) + +##### MacOS 사용자 + +```bash +brew install ffmpeg +``` + +#### 의존성 설치 + +```bash +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +### Docker에서 사용 + +#### docker-compose.yaml 설정 + +0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(오래된 버전) 에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다. + +1. 환경 변수: + +- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다. + +2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다. + +3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다. + +4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다. 
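참고로, 위에서 설명한 설정 항목을 docker-compose.yaml 형태로 정리하면 대략 아래와 같은 스케치가 됩니다. 이미지 태그, 호스트 쪽 볼륨 경로, GPU 지정은 어디까지나 가정한 예시이므로, 실제로는 저장소에 포함된 docker-compose.yaml을 기준으로 자신의 환경에 맞게 조정하세요.

```yaml
services:
  gpt-sovits:
    # 태그는 Docker Hub에서 확인한 값으로 교체하세요 (여기서는 자리표시용 예시)
    image: breakstring/gpt-sovits:xxxxx
    environment:
      # "SSL 추출" 단계에서 4-cnhubert/5-wav32k가 올바르게 생성되지 않으면 이 값을 조정
      - is_half=False
    volumes:
      # 호스트 쪽 경로는 예시이므로 실제 디렉토리로 바꾸세요
      - ./output:/workspace/output
      - ./logs:/workspace/logs
      - ./SoVITS_weights:/workspace/SoVITS_weights
    working_dir: /workspace
    ports:
      - "9880:9880"
      - "9871:9871"
      - "9872:9872"
      - "9873:9873"
      - "9874:9874"
    # Windows Docker Desktop의 기본 공유 메모리가 작으므로 명시적으로 확대
    shm_size: "16G"
    # GPU를 사용할 때의 deploy 섹션 예시 (시스템과 실제 상황에 맞게 조정)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
```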
+ +#### docker compose로 실행 + +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### docker 명령으로 실행 + +위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다: + +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +``` + +## 사전 학습된 모델 + +**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.** + +1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요. + +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용) + +3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요. + + - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다. + + - 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다. + +4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. + +5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다. + +## 데이터셋 형식 + +텍스트 음성 합성(TTS) 주석 .list 파일 형식: + +``` +vocal_path|speaker_name|language|text +``` + +언어 사전: + +- 'zh': 중국어 +- 'ja': 일본어 +- 'en': 영어 + +예시: + +``` +D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. +``` + +## 미세 조정 및 추론 + +### WebUI 열기 + +#### 통합 패키지 사용자 + +`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오. +V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오. + +#### 기타 + +```bash +python webui.py <언어(옵션)> +``` + +V1으로 전환하려면, + +```bash +python webui.py v1 <언어(옵션)> +``` + +또는 WebUI에서 수동으로 버전을 전환하십시오. + +### 미세 조정 + +#### 경로 자동 채우기가 지원됩니다 + + 1. 오디오 경로를 입력하십시오. + 2. 오디오를 작은 청크로 분할하십시오. + 3. 노이즈 제거(옵션) + 4. ASR 수행 + 5. ASR 전사를 교정하십시오. + 6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. + +### 추론 WebUI 열기 + +#### 통합 패키지 사용자 + +`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다. + +#### 기타 + +```bash +python GPT_SoVITS/inference_webui.py <언어(옵션)> +``` + +또는 + +```bash +python webui.py +``` + +그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다. + +## V2 릴리스 노트 + +새로운 기능: + +1. 한국어 및 광둥어 지원 + +2. 최적화된 텍스트 프론트엔드 + +3. 
사전 학습 모델이 2천 시간에서 5천 시간으로 확장 + +4. 저품질 참조 오디오에 대한 합성 품질 향상 + + [자세한 내용]() + +V1 환경에서 V2를 사용하려면: + +1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트 + +2. github에서 최신 코드를 클론하십시오. + +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오. + + 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) + +## V3 릴리스 노트 + +새로운 기능: + +1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.) + +2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다. + + [자세한 내용]() + +v2 환경에서 v3 사용하기: + +1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다. + +2. 최신 코드를 github 에서 클론합니다. + +3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다. + + 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요. + +## 할 일 목록 + +- [x] **최우선순위:** + + - [x] 일본어 및 영어 지역화. + - [x] 사용자 가이드. + - [x] 일본어 및 영어 데이터셋 미세 조정 훈련. + +- [ ] **기능:** + + - [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분). + - [x] TTS 속도 제어. + - [ ] ~~향상된 TTS 감정 제어.~~ + - [ ] SoVITS 토큰 입력을 단어 확률 분포로 변경해 보세요. + - [x] 영어 및 일본어 텍스트 프론트 엔드 개선. + - [ ] 작은 크기와 큰 크기의 TTS 모델 개발. + - [x] Colab 스크립트. + - [ ] 훈련 데이터셋 확장 (2k 시간에서 10k 시간). + - [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질). + - [ ] 모델 블렌딩. + +## (추가적인) 명령줄에서 실행하는 방법 + +명령줄을 사용하여 UVR5용 WebUI 열기 + +``` +python tools/uvr5/webui.py "" +``` + + + +명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다. + +``` +python audio_slicer.py \ + --input_path "" \ + --output_root "" \ + --threshold \ + --min_length \ + --min_interval + --hop_size +``` + +명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당). + +``` +python tools/asr/funasr_asr.py -i -o +``` + +ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다. + +(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) + +``` +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p +``` + +사용자 정의 목록 저장 경로가 활성화되었습니다. 
+ +## 감사의 말 + +다음 프로젝트와 기여자들에게 특별히 감사드립니다: + +### 이론 연구 + +- [ar-vits](https://github.com/innnky/ar-vits) +- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) +- [vits](https://github.com/jaywalnut310/vits) +- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) +- [contentvec](https://github.com/auspicious3000/contentvec/) +- [hifi-gan](https://github.com/jik876/hifi-gan) +- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) +- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) +- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) + +### 사전 학습 모델 + +- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) +- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) +- [BigVGAN](https://github.com/NVIDIA/BigVGAN) + +### 추론용 텍스트 프론트엔드 + +- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) +- [split-lang](https://github.com/DoodleBears/split-lang) +- [g2pW](https://github.com/GitYCC/g2pW) +- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) +- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) + +### WebUI 도구 + +- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) +- [audio-slicer](https://github.com/openvpi/audio-slicer) +- [SubFix](https://github.com/cronrpc/SubFix) +- [FFmpeg](https://github.com/FFmpeg/FFmpeg) +- [gradio](https://github.com/gradio-app/gradio) +- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [AP-BWE](https://github.com/yxlu-0102/AP-BWE) + +@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다. + +## 모든 기여자들에게 감사드립니다 ;) + + + + diff --git a/docs/tr/Changelog_TR.md b/docs/tr/Changelog_TR.md new file mode 100644 index 0000000000000000000000000000000000000000..3de7c8a2a4f1d152f52fdd3b59f57aa365b09da7 --- /dev/null +++ b/docs/tr/Changelog_TR.md @@ -0,0 +1,222 @@ +### 20240121 Güncellemesi + +1. `config`e `is_share` eklendi. Colab gibi senaryolarda, WebUI'yi halka açık ağa yönlendirmek için bu değeri `True` olarak ayarlayabilirsiniz. +2. WebUI'ye İngilizce sistem çeviri desteği eklendi. +3. `cmd-asr`, FunASR modelinin dahil olup olmadığını otomatik olarak tespit eder; eğer varsayılan dizinde bulunamazsa, ModelScope'dan indirilecektir. +4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)de bildirilen SoVITS eğitimindeki ZeroDivisionError'u sıfır uzunlukta örnekleri filtreleyerek düzeltmeye çalıştık. +5. `TEMP` klasöründeki önbelleğe alınmış ses dosyaları ve diğer dosyaları temizledik. +6. Referans sesinin sonunu içeren sentezlenmiş ses sorununu önemli ölçüde azalttık. + +### 20240122 Güncellemesi + +1. Aşırı kısa çıktı dosyalarının referans sesini tekrarlamasına neden olan sorun giderildi. +2. İngilizce ve Japonca eğitim için yerel destek test edildi (Japonca eğitim için kök dizinin İngilizce olmayan özel karakterlerden arındırılmış olması gerekir). +3. Ses yolu denetimi iyileştirildi. Yanlış bir giriş yolundan okumaya çalışıldığında, ffmpeg hatası yerine yolun mevcut olmadığını bildirir. + +### 20240123 Güncellemesi + +1. Hubert çıkarımının NaN hatalarına neden olup SoVITS/GPT eğitiminde ZeroDivisionError'a yol açtığı sorun çözüldü. +2. 
İnferans WebUI'de hızlı model değiştirme desteği eklendi. +3. Model dosyası sıralama mantığı optimize edildi. +4. Çince kelime ayrımı için `jieba` `jieba_fast` ile değiştirildi. + +### 20240126 Güncellemesi + +1. Çince-İngilizce ve Japonca-İngilizce karışık çıktı metinleri için destek eklendi. +2. Çıktı için isteğe bağlı bir bölme modu eklendi. +3. UVR5'in dizinlerden otomatik olarak çıkmasına neden olan okuma sorununu düzelttik. +4. Çeşitli yeni satır sorunlarını düzelterek çıkarım hatalarını giderdik. +5. Çıkarım WebUI'deki gereksiz günlükleri kaldırdık. +6. Mac'te eğitim ve çıkarım desteği eklendi. +7. Yarım hassasiyeti desteklemeyen GPU'lar için otomatik olarak tek hassasiyet zorlandı; CPU çıkarımında tek hassasiyet uygulandı. + +### 20240128 Güncellemesi + +1. Sayıların Çince karakterlere dönüştürülmesiyle ilgili sorunu düzelttik. +2. Cümlelerin başındaki birkaç karakterin yutulması sorununu düzelttik. +3. Mantıksız referans ses uzunluklarını sınırlamalar koyarak hariç tuttuk. +4. GPT eğitiminin kontrol noktalarını kaydetmemesi sorununu düzelttik. +5. Dockerfile'da model indirme sürecini tamamladık. + +### 20240129 Güncellemesi + +1. Yarım hassasiyet eğitimi ile ilgili sorun yaşayan 16 serisi gibi GPU'lar için eğitim yapılandırmalarını tek hassasiyete değiştirdik. +2. Mevcut Colab sürümünü test ettik ve güncelledik. +3. Eski sürüm FunASR ile ModelScope FunASR deposunun git klonlanmasıyla oluşan arayüz hizalama hatalarını düzelttik. + +### 20240130 Güncellemesi + +1. Çift tırnaklarla yol kopyalama hatalarını önlemek için tüm yol ile ilgili girdilerden otomatik olarak çift tırnakları kaldırdık. +2. Çince ve İngilizce noktalama işaretlerini ayırma sorunlarını düzelttik ve cümlelerin başına ve sonuna noktalama işaretleri ekledik. +3. Noktalama işaretlerine göre ayırma özelliğini ekledik. + +### 20240201 Güncellemesi + +1. Ayrılma hatalarına neden olan UVR5 format okuma hatasını düzelttik. +2. Karışık Çince-Japonca-İngilizce metinler için otomatik segmentasyon ve dil tanıma desteği sağladık. + +### 20240202 Güncellemesi + +1. `/` ile biten bir ASR yolunun dosya adını kaydetme hatasına neden olma sorununu düzelttik. +2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) PaddleSpeech'in Normalizer'ını tanıtarak "xx.xx%" (yüzde sembolleri) ve "元/吨" ifadesinin "元吨" yerine "元每吨" olarak okunması gibi sorunları düzelttik ve alt çizgi hatalarını giderdik. + +### 20240207 Güncellemesi + +1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)de bildirilen dil parametresi karışıklığının Çinçe çıkarım kalitesini düşürme sorununu düzelttik. +2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) ile UVR5'i daha yüksek versiyonlarda librosa'ya uyarladık. +3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) `is_half` parametresinin booleana dönüştürülmemesi nedeniyle sürekli yarım hassasiyet çıkarımı yaparak 16 serisi GPU'larda `inf` hatasına neden olan UVR5 inf hatasını düzelttik. +4. İngilizce metin önyüzünü optimize ettik. +5. Gradio bağımlılıklarını düzelttik. +6. Veri seti hazırlığı sırasında kök dizini boş bırakıldığında `.list` tam yollarının otomatik olarak okunmasını destekledik. +7. Japonca ve İngilizce için Faster Whisper ASR'yi entegre ettik. + +### 20240208 Güncellemesi + +1. 
[Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) ile Windows 10 1909'da ve [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)de (Geleneksel Çince Sistem Dili) bildirilen GPT eğitim durma sorununu düzeltmeye çalıştık. + +### 20240212 Güncellemesi + +1. Faster Whisper ve FunASR için mantığı optimize ettik, Hugging Face bağlantı sorunlarını önlemek için Faster Whisper'ı ayna indirmelere yönlendirdik. +2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) GPT tekrarı ve eksik karakterleri azaltmak için eğitim sırasında negatif örnekler oluşturarak deneysel DPO Loss eğitim seçeneğini etkinleştirdi ve çıkarım WebUI'de çeşitli çıkarım parametrelerini kullanılabilir hale getirdi. + +### 20240214 Güncellemesi + +1. Eğitimde Çince deney adlarını destekledik (önceden hatalara neden oluyordu). +2. DPO eğitimini zorunlu yerine isteğe bağlı bir özellik yaptık. Seçilirse, parti boyutu otomatik olarak yarıya indirilir. Çıkarım WebUI'de yeni parametrelerin iletilmemesi sorunlarını düzelttik. + +### 20240216 Güncellemesi + +1. Referans metin olmadan girişi destekledik. +2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) de bildirilen Çince önyüz hatalarını düzelttik. + +### 20240221 Güncellemesi + +1. Veri işleme sırasında bir gürültü azaltma seçeneği ekledik (gürültü azaltma sadece 16kHz örnekleme hızını bırakır; yalnızca arka plan gürültüsü önemliyse kullanın). +2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) ile Çince ve Japonca önyüz işlemesini optimize ettik. +3. Mac CPU çıkarımını daha hızlı performans için MPS yerine CPU kullanacak şekilde değiştirdik. +4. Colab genel URL sorununu düzelttik. + +### 20240306 Güncellemesi + +1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) çıkarımı %50 hızlandırdı (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 üzerinde test edildi). +2. Faster Whisper'ın Çince olmayan ASR'sini kullanırken artık önce Çin FunASR modelini indirmeyi gerektirmiyor. +3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) UVR5 yankı giderme modelindeki ayarın tersine çevrildiği sorunu düzeltti. +4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) CUDA mevcut olmadığında Faster Whisper için otomatik CPU çıkarımını etkinleştirdi. +5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) Mac'te doğru CPU çıkarımı sağlamak için `is_half` kontrolünü değiştirdi. + +### 202403/202404/202405 Güncellemeleri + +#### Küçük Düzeltmeler: + +1. Referans metin olmayan mod ile ilgili sorunlar düzeltildi. +2. Çince ve İngilizce metin önyüzü optimize edildi. +3. API formatı iyileştirildi. +4. CMD format sorunları düzeltildi. +5. Eğitim verisi işleme sırasında desteklenmeyen diller için hata uyarıları eklendi. +6. Hubert çıkarımındaki hata düzeltildi. + +#### Büyük Düzeltmeler: + +1. VQ'yu dondurmadan yapılan SoVITS eğitimi sorunu (bu kalite düşüşüne neden olabilir) düzeltildi. +2. Hızlı çıkarım dalı eklendi. + +### 20240610 Güncellemesi + +#### Küçük Düzeltmeler: + +1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) saf noktalama işareti ve çoklu noktalama işareti metin girdisi için mantığı geliştirdi. +2. 
[Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5'teki MDXNet yankı giderme için CMD formatını düzeltti, boşluk içeren yolları destekledi. +3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py` içindeki SoVITS eğitimi için ilerleme çubuğu mantığını düzeltti. + +#### Büyük Düzeltmeler: + +4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu. + **Dikkat: Daha önce büyük miktarda veri ile ince ayar yaptıysanız, modelin kalitesini artırmak için yeniden ayar yapmanız önerilir.** + +### 20240706 Güncellemesi + +#### Küçük Düzeltmeler: + +1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU çıkarımında varsayılan yığın boyutu ondalık sorununu düzeltti. +2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) gürültü giderme veya ASR ile ilgili istisnalarla karşılaşıldığında bekleyen tüm ses dosyalarının çıkış yapmasına neden olan sorunları düzeltti. +3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) noktalama işaretlerine göre ayrılırken ondalıkların bölünmesi sorununu düzeltti. +4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) çoklu GPU eğitimi için çoklu işlem kaydetme mantığını düzeltti. +5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) gereksiz `my_utils`'ı kaldırdı. + +#### Büyük Düzeltmeler: + +6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) hızlandırılmış çıkarım kodu doğrulandı ve ana dala birleştirildi, taban ile tutarlı çıkarım etkileri sağlandı. + Ayrıca referans metni olmayan modda hızlandırılmış çıkarımı destekler. + +**Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.** + +### 20240727 Güncellemesi + +#### Küçük Düzeltmeler: + +1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) gereksiz i18n kodlarını temizledi. +2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) kullanıcı dosya yollarındaki sonlandırma eğik çizgilerinin komut satırı hatalarına neden olduğu sorunları düzeltti. +3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT eğitimindeki adım hesaplama mantığını düzeltti. + +#### Büyük Düzeltmeler: + +4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi. + Konuşma hızını ayarlarken rastgeleliği dondurmayı etkinleştirdi. + +### 20240806 Güncellemesi + +1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer vokal eşlik ayırma modelini desteklemeye başladı. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 çıkarımı etkinleştirdi. +2. Çince metin ön yüzünü geliştirdi. 
+ - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) çoklu heceli karakterler için destek ekledi (v2 sadece); + - [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) sayı belirleyici ekledi; + - [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) aritmetik ve temel matematik formüllerini destekler; + - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) karışık metin hatalarını düzeltti. +3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI'de ses işlenirken yolları otomatik olarak doldurdu. +4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU tanıma mantığını optimize etti. +5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) Kantonca ASR desteği ekledi. +6. GPT-SoVITS v2 desteği eklendi. +7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti. + +### 20240821 Güncelleme + +1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` dalını ana dala birleştir. +2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML etiketlerini kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize etme desteği. +3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API düzeltildi ve optimize edildi. +4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Karıştırmak için yalnızca bir referans sesi yüklenebiliyordu hatası düzeltildi, çeşitli veri seti kontrolleri eklendi ve eksik dosyalar için uyarılar çıkar. + +### 20250211 Güncellemesi + +- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 Modeli Eklendi, SoVITS v3'ü ince ayar yapmak için 14GB GPU belleği gereklidir. + +### 20250212 Güncellemesi + +- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3'ü ince ayar yapmak için gradyan kontrol noktası ekledi, 12GB GPU belleği gereklidir. + +### 20250214 Güncellemesi + +- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Çok dilli karışık metin segmentasyon stratejisi **A**'yı optimize etti. + - `split-lang` bir dil segmentasyon aracı olarak eklendi ve çok dilli karışık metinlerin segmentasyon yeteneklerini iyileştirdi. + +### 20250217 Güncellemesi + +- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Metindeki sayılar ve İngilizceyi işleme mantığını optimize etti. + +### 20250218 Güncellemesi + +- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Çok dilli karışık metin segmentasyon stratejisi **B**'yi optimize etti. + +### 20250223 Güncellemesi + +1. SoVITS V3 için LoRA eğitimi, ince ayar yapmayı destekler. 8GB GPU belleği gereklidir ve sonuçlar tam parametreli ince ayar yapmaktan daha iyidir. +2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Mel Band RoFormer modelini vokal ve enstrüman ayrımı için ekledi. + +### 20250226 Güncellemesi + +1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti. + - Korece için `langsegmenter` kullanımı ile ilgili sorun düzeltildi. +2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti. + - Korece/Japonca için `langsegmenter` kullanımı ile ilgili sorun düzeltildi. 
+ +### 20250227 Güncellemesi + +- 24K sesli V3 modeliyle 24K ses oluştururken meydana gelen boğukluk sorununu hafifletmek için, 24K'dan 48K'ya ses süper çözünürlük modelleri eklendi. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117) de bildirilen sorunlar. \ No newline at end of file diff --git a/docs/tr/README.md b/docs/tr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d59f66baf0da3e708c73d6e47d1113217508af48 --- /dev/null +++ b/docs/tr/README.md @@ -0,0 +1,385 @@ +
+
+<h1>GPT-SoVITS-WebUI</h1>
+Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüzü.<br><br>

+
+[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
+
+[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
+[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
+
+ +--- + +## Özellikler: + +1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin. + +2. **Birkaç Örnekli Metinden Konuşmaya:** Daha iyi ses benzerliği ve gerçekçiliği için modeli yalnızca 1 dakikalık eğitim verisiyle ince ayarlayın. + +3. **Çapraz Dil Desteği:** Eğitim veri setinden farklı dillerde çıkarım, şu anda İngilizce, Japonca, Çince, Kantonca ve Koreceyi destekliyor. + +4. **Web Arayüzü Araçları:** Entegre araçlar arasında vokal eşliğinde ayırma, otomatik eğitim seti segmentasyonu, Çince ASR ve metin etiketleme bulunur ve yeni başlayanların eğitim veri setleri ve GPT/SoVITS modelleri oluşturmalarına yardımcı olur. + +**[Demo videomuzu](https://www.bilibili.com/video/BV12g4y1m7Uw) buradan izleyin!** + +Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu: + +https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + +**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + +## Kurulum + +### Test Edilmiş Ortamlar + +| Python Version | PyTorch Version | Device | +|----------------|------------------|-----------------| +| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | +| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | +| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.6.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | + +### Windows + +Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın. + +### Linux + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### macOS + +**Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.** + +1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin. +2. Aşağıdaki komutları çalıştırarak programı yükleyin: + +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh --source [--download-uvr5] +``` + +### El ile Yükleme + +#### FFmpeg'i Yükleme + +##### Conda Kullanıcıları + +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian Kullanıcıları + +```bash +sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' +``` + +##### Windows Kullanıcıları + +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin. + +##### MacOS Kullanıcıları + +```bash +brew install ffmpeg +``` + +#### Bağımlılıkları Yükleme + +```bash +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +### Docker Kullanarak + +#### docker-compose.yaml yapılandırması + +0. 
Görüntü etiketleri hakkında: Kod tabanındaki hızlı güncellemeler ve görüntüleri paketleme ve test etme işleminin yavaş olması nedeniyle, lütfen şu anda paketlenmiş en son görüntüleri kontrol etmek için [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(eski sürüm) adresini kontrol edin ve durumunuza göre seçim yapın veya alternatif olarak, kendi ihtiyaçlarınıza göre bir Dockerfile kullanarak yerel olarak oluşturun. +1. Ortam Değişkenleri: + - is_half: Yarım hassasiyet/çift hassasiyeti kontrol eder. Bu genellikle "SSL çıkarma" adımı sırasında 4-cnhubert/5-wav32k dizinleri altındaki içeriğin doğru şekilde oluşturulmamasının nedenidir. Gerçek durumunuza göre True veya False olarak ayarlayın. +2. Birim Yapılandırması, Kapsayıcı içindeki uygulamanın kök dizini /workspace olarak ayarlanmıştır. Varsayılan docker-compose.yaml, içerik yükleme/indirme için bazı pratik örnekler listeler. +3. shm_size: Windows üzerinde Docker Desktop için varsayılan kullanılabilir bellek çok küçüktür, bu da anormal işlemlere neden olabilir. Kendi durumunuza göre ayarlayın. +4. Dağıtım bölümü altında, GPU ile ilgili ayarlar sisteminize ve gerçek koşullara göre dikkatlice ayarlanmalıdır. + +#### docker compose ile çalıştırma + +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### docker komutu ile çalıştırma + +Yukarıdaki gibi, ilgili parametreleri gerçek durumunuza göre değiştirin, ardından aşağıdaki komutu çalıştırın: + +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +``` + +## Önceden Eğitilmiş Modeller + +**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.** + +1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin. + +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için) + +3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin. + + - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır. + + - Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. 
Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir. + +4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin. + +5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir. + +## Veri Seti Formatı + +TTS açıklama .list dosya formatı: + +``` +vocal_path|speaker_name|language|text +``` + +Dil sözlüğü: + +- 'zh': Çince +- 'ja': Japonca +- 'en': İngilizce +- 'ko': Korece +- 'yue': Kantonca + +Örnek: + +``` +D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. +``` + +## İnce Ayar ve Çıkarım + +### WebUI'yi Açın + +#### Entegre Paket Kullanıcıları + +`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın. +V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `go-webui-v1.ps1` kullanın. + +#### Diğerleri + +```bash +python webui.py +``` + +V1'e geçmek istiyorsanız, + +```bash +python webui.py v1 +``` + +veya WebUI'de manuel olarak sürüm değiştirin. + +### İnce Ayar + +#### Yol Otomatik Doldurma artık destekleniyor + + 1. Ses yolunu doldurun + 2. Sesi küçük parçalara ayırın + 3. Gürültü azaltma (isteğe bağlı) + 4. ASR + 5. ASR transkripsiyonlarını düzeltin + 6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın + +### Çıkarım WebUI'sini Açın + +#### Entegre Paket Kullanıcıları + +`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın. + +#### Diğerleri + +```bash +python GPT_SoVITS/inference_webui.py +``` + +VEYA + +```bash +python webui.py +``` + +ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın. + +## V2 Sürüm Notları + +Yeni Özellikler: + +1. Korece ve Kantonca destekler + +2. Optimize edilmiş metin ön yüzü + +3. Önceden eğitilmiş model 2k saatten 5k saate kadar genişletildi + +4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi + + [detaylar burada]() + +V1 ortamından V2'yi kullanmak için: + +1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin + +2. github'dan en son kodları klonlayın. + +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin. + + Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) + +## V3 Sürüm Notları + +### Yeni Özellikler: + +1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir). + +2. 
GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi. + + [daha fazla detay]() + +### v2 ortamında v3 kullanımı: + +1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin. + +2. GitHub'dan en son kodları klonlayın. + +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin. + + ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz. + +## Yapılacaklar Listesi + +- [x] **Yüksek Öncelikli:** + + - [x] Japonca ve İngilizceye yerelleştirme. + - [x] Kullanıcı kılavuzu. + - [x] Japonca ve İngilizce veri seti ince ayar eğitimi. + +- [ ] **Özellikler:** + - [x] Sıfır örnekli ses dönüştürme (5s) / birkaç örnekli ses dönüştürme (1dk). + - [x] Metinden konuşmaya konuşma hızı kontrolü. + - [ ] ~~Gelişmiş metinden konuşmaya duygu kontrolü.~~ + - [ ] SoVITS token girdilerini kelime dağarcığı olasılık dağılımına değiştirme denemesi. + - [x] İngilizce ve Japonca metin ön ucunu iyileştirme. + - [ ] Küçük ve büyük boyutlu metinden konuşmaya modelleri geliştirme. + - [x] Colab betikleri. + - [ ] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat). + - [x] daha iyi sovits temel modeli (geliştirilmiş ses kalitesi) + - [ ] model karışımı + +## (Ekstra) Komut satırından çalıştırma yöntemi + +UVR5 için Web Arayüzünü açmak için komut satırını kullanın + +``` +python tools/uvr5/webui.py "" +``` + + + +Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır + +``` +python audio_slicer.py \ + --input_path "" \ + --output_root "" \ + --threshold \ + --min_length \ + --min_interval + --hop_size +``` + +Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince) + +``` +python tools/asr/funasr_asr.py -i -o <çıktı> +``` + +ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme) + +(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir) + +``` +python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l +``` + +Özel bir liste kaydetme yolu etkinleştirildi + +## Katkı Verenler + +Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz: + +### Teorik Araştırma + +- [ar-vits](https://github.com/innnky/ar-vits) +- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) +- [vits](https://github.com/jaywalnut310/vits) +- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556) +- [contentvec](https://github.com/auspicious3000/contentvec/) +- [hifi-gan](https://github.com/jik876/hifi-gan) +- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) +- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) +- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) + +### Önceden Eğitilmiş Modeller + +- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) +- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) +- [BigVGAN](https://github.com/NVIDIA/BigVGAN) + +### Tahmin İçin Metin Ön Ucu + +- [paddlespeech 
zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) +- [split-lang](https://github.com/DoodleBears/split-lang) +- [g2pW](https://github.com/GitYCC/g2pW) +- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) +- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) + +### WebUI Araçları + +- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) +- [audio-slicer](https://github.com/openvpi/audio-slicer) +- [SubFix](https://github.com/cronrpc/SubFix) +- [FFmpeg](https://github.com/FFmpeg/FFmpeg) +- [gradio](https://github.com/gradio-app/gradio) +- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [AP-BWE](https://github.com/yxlu-0102/AP-BWE) + +@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım. + +## Tüm katkıda bulunanlara çabaları için teşekkürler + + + + diff --git a/extra-req.txt b/extra-req.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2324117f5edb41b3c5f3c13b2b0bf2dd395942 --- /dev/null +++ b/extra-req.txt @@ -0,0 +1 @@ +faster-whisper diff --git a/go-webui.bat b/go-webui.bat new file mode 100644 index 0000000000000000000000000000000000000000..a2dfff6c0a5444ac2524cd981078d56399b79505 --- /dev/null +++ b/go-webui.bat @@ -0,0 +1,2 @@ +runtime\python.exe -I webui.py zh_CN +pause diff --git a/go-webui.ps1 b/go-webui.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..f9427263d1a1402a5d09923f18b19af3d35e8b2f --- /dev/null +++ b/go-webui.ps1 @@ -0,0 +1,4 @@ +$ErrorActionPreference = "SilentlyContinue" +chcp 65001 +& "$PSScriptRoot\runtime\python.exe" -I "$PSScriptRoot\webui.py" zh_CN +pause diff --git a/gpt-sovits_kaggle.ipynb b/gpt-sovits_kaggle.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..764c23c424c9d307f70a1fd7ce151c2978e00226 --- /dev/null +++ b/gpt-sovits_kaggle.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9fd922fb", + "metadata": {}, + "source": [ + "# Deprecated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45857cb2", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "execution": { + "iopub.execute_input": "2024-02-18T14:43:46.735480Z", + "iopub.status.busy": "2024-02-18T14:43:46.735183Z", + "iopub.status.idle": "2024-02-18T14:48:10.724175Z", + "shell.execute_reply": "2024-02-18T14:48:10.723059Z" + }, + "papermill": { + "duration": 263.994935, + "end_time": "2024-02-18T14:48:10.726613", + "exception": false, + "start_time": "2024-02-18T14:43:46.731678", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", + "%cd GPT-SoVITS\n", + "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", + "!pip install -r requirements.txt\n", + "!pip install -r extra-req.txt --no-deps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9d346b4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-18T14:48:10.815802Z", + "iopub.status.busy": "2024-02-18T14:48:10.814899Z", + "iopub.status.idle": "2024-02-18T14:50:31.253276Z", + "shell.execute_reply": "2024-02-18T14:50:31.252024Z" + }, + 
"papermill": { + "duration": 140.484893, + "end_time": "2024-02-18T14:50:31.255720", + "exception": false, + "start_time": "2024-02-18T14:48:10.770827", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# @title Download pretrained models 下载预训练模型\n", + "!mkdir -p /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", + "!mkdir -p /kaggle/working/GPT-SoVITS/tools/asr/models\n", + "!mkdir -p /kaggle/working/GPT-SoVITS/tools/uvr5\n", + "%cd /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", + "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", + "%cd /kaggle/working/GPT-SoVITS/tools/asr/models\n", + "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", + "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", + "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", + "# # @title UVR5 pretrains 安装uvr5模型\n", + "%cd /kaggle/working/GPT-SoVITS/tools/uvr5\n", + "!git clone https://huggingface.co/Delik/uvr5_weights\n", + "!git config core.sparseCheckout true\n", + "!mv /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea94d245", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-18T14:29:01.071549Z", + "iopub.status.busy": "2024-02-18T14:29:01.070592Z", + "iopub.status.idle": "2024-02-18T14:40:45.318368Z", + "shell.execute_reply": "2024-02-18T14:40:45.317130Z", + "shell.execute_reply.started": "2024-02-18T14:29:01.071512Z" + }, + "papermill": { + "duration": null, + "end_time": null, + "exception": false, + "start_time": "2024-02-18T14:50:31.309013", + "status": "running" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# @title launch WebUI 启动WebUI\n", + "%cd /kaggle/working/GPT-SoVITS/\n", + "!npm install -g localtunnel\n", + "import subprocess\n", + "import threading\n", + "import time\n", + "import socket\n", + "import urllib.request\n", + "\n", + "\n", + "def iframe_thread(port):\n", + " while True:\n", + " time.sleep(0.5)\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", + " if result == 0:\n", + " break\n", + " sock.close()\n", + "\n", + " from colorama import Fore, Style\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", + " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", + " for line in p.stdout:\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9874,)).start()\n", + "\n", + "!python webui.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dda88a6d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-18T14:40:56.880608Z", + "iopub.status.busy": "2024-02-18T14:40:56.879879Z" + }, + "papermill": { + "duration": null, + "end_time": null, + "exception": null, + "start_time": null, + "status": "pending" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# 开启推理页面\n", + "%cd /kaggle/working/GPT-SoVITS/\n", + "!npm install -g localtunnel\n", + "import threading\n", + "\n", + "\n", + "def iframe_thread(port):\n", + " while 
True:\n", + " time.sleep(0.5)\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", + " if result == 0:\n", + " break\n", + " sock.close()\n", + "\n", + " from colorama import Fore, Style\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", + " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", + " for line in p.stdout:\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9872,)).start()\n", + "\n", + "!python ./GPT_SoVITS/inference_webui.py" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "nvidiaTeslaT4", + "dataSources": [ + { + "datasetId": 4459328, + "sourceId": 7649639, + "sourceType": "datasetVersion" + } + ], + "dockerImageVersionId": 30646, + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": null, + "end_time": null, + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-02-18T14:43:44.011910", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/install.sh b/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..e45b91b1e63124b381f949be37d147f772c8bfc9 --- /dev/null +++ b/install.sh @@ -0,0 +1,215 @@ +#!/bin/bash + +# cd into GPT-SoVITS Base Path +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +set -e + +if ! command -v conda &>/dev/null; then + echo "Conda Not Found" + exit 1 +fi + +trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR + +is_HF=false +is_HF_MIRROR=false +is_MODELSCOPE=false +DOWNLOAD_UVR5=false + +print_help() { + echo "Usage: bash install.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)" + echo " --download-uvr5 Enable downloading the UVR5 model" + echo " -h, --help Show this help message and exit" + echo "" + echo "Examples:" + echo " bash install.sh --source HF --download-uvr5" + echo " bash install.sh --source ModelScope" +} + +# Show help if no arguments provided +if [[ $# -eq 0 ]]; then + print_help + exit 0 +fi + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --source) + case "$2" in + HF) + is_HF=true + ;; + HF-Mirror) + is_HF_MIRROR=true + ;; + ModelScope) + is_MODELSCOPE=true + ;; + *) + echo "Error: Invalid Download Source: $2" + echo "Choose From: [HF, HF-Mirror, ModelScope]" + exit 1 + ;; + esac + shift 2 + ;; + --download-uvr5) + DOWNLOAD_UVR5=true + shift + ;; + -h|--help) + print_help + exit 0 + ;; + *) + echo "Unknown Argument: $1" + echo "Use -h or --help to see available options." + exit 1 + ;; + esac +done + +if ! $is_HF && ! $is_HF_MIRROR && ! 
$is_MODELSCOPE; then + echo "Error: Download Source is REQUIRED" + echo "" + print_help + exit 1 +fi + +if [ "$is_HF" = "true" ]; then + echo "Download Model From HuggingFace" + PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" +elif [ "$is_HF_MIRROR" = "true" ]; then + echo "Download Model From HuggingFace-Mirror" + PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" +elif [ "$is_MODELSCOPE" = "true" ]; then + echo "Download Model From ModelScope" + PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" + G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" + UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" +fi + +if find "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then + echo "Pretrained Model Exists" +else + echo "Download Pretrained Models" + wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$PRETRINED_URL" + + unzip pretrained_models.zip + rm -rf pretrained_models.zip + mv pretrained_models/* GPT_SoVITS/pretrained_models + rm -rf pretrained_models +fi + +if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then + echo "Download G2PWModel" + wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$G2PW_URL" + + unzip G2PWModel.zip + rm -rf G2PWModel.zip + mv G2PWModel GPT_SoVITS/text/G2PWModel +else + echo "G2PWModel Exists" +fi + +if [ "$DOWNLOAD_UVR5" = "true" ];then + if find "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then + echo "UVR5 Model Exists" + else + echo "Download UVR5 Model" + wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$UVR5_URL" + + unzip uvr5_weights.zip + rm -rf uvr5_weights.zip + mv uvr5_weights/* tools/uvr5/uvr5_weights + rm -rf uvr5_weights + fi +fi + +# 安装构建工具 +# Install build tools +echo "Installing GCC..." +conda install -c conda-forge gcc=14 -y + +echo "Installing G++..." +conda install -c conda-forge gxx -y + +echo "Installing ffmpeg and cmake..." +conda install ffmpeg cmake -y + +echo "Installing git-lfs and zip..." +conda install git-lfs -y +conda install zip -y + +git-lfs install + +echo "Checking for CUDA installation..." +if command -v nvidia-smi &>/dev/null; then + USE_CUDA=true + echo "CUDA found." +else + echo "CUDA not found." + USE_CUDA=false +fi + +if [ "$USE_CUDA" = false ]; then + echo "Checking for ROCm installation..." + if [ -d "/opt/rocm" ]; then + USE_ROCM=true + echo "ROCm found." + if grep -qi "microsoft" /proc/version; then + echo "You are running WSL." + IS_WSL=true + else + echo "You are NOT running WSL." + IS_WSL=false + fi + else + echo "ROCm not found." + USE_ROCM=false + fi +fi + +if [ "$USE_CUDA" = true ]; then + echo "Installing PyTorch with CUDA support..." + pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124 +elif [ "$USE_ROCM" = true ]; then + echo "Installing PyTorch with ROCm support..." 
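+ # The ROCm build is pulled from the rocm6.2 wheel index; on WSL the
+ # libhsa-runtime64 swap at the end of this script is applied on top of it.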
+ pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 +else + echo "Installing PyTorch for CPU..." + pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu +fi + +echo "Installing Python dependencies from requirements.txt..." + +# 刷新环境 +# Refresh environment +hash -r + +pip install -r extra-req.txt --no-deps + +pip install -r requirements.txt + +python -c "import nltk; nltk.download(['averaged_perceptron_tagger','averaged_perceptron_tagger_eng','cmudict'])" + +if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then + echo "Update to WSL compatible runtime lib..." + location=$(pip show torch | grep Location | awk -F ": " '{print $2}') + cd "${location}"/torch/lib/ || exit + rm libhsa-runtime64.so* + cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so +fi + +echo "Installation completed successfully!" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9703b250e968310ad2611607ef038a491f984955 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,45 @@ +numpy<2.0 +scipy +tensorboard +librosa==0.10.2 +numba +pytorch-lightning>=2.4 +gradio<5 +ffmpeg-python +onnxruntime; sys_platform == 'darwin' +onnxruntime-gpu; sys_platform != 'darwin' +tqdm +funasr==1.0.27 +cn2an +pypinyin +pyopenjtalk>=0.4.1 +g2p_en +torchaudio +modelscope==1.10.0 +sentencepiece +transformers>=4.43 +peft +chardet +PyYAML +psutil +jieba_fast +jieba +split-lang +fast_langdetect>=0.3.1 +wordsegment +rotary_embedding_torch +ToJyutping +g2pk2 +ko_pron +opencc; sys_platform != 'linux' +opencc==1.1.1; sys_platform == 'linux' +python_mecab_ko; sys_platform != 'win32' +fastapi[standard]>=0.115.2 +x_transformers +torchmetrics<=1.5 +pydantic<=2.10.6 +ctranslate2>=4.0,<5 +huggingface_hub>=0.13 +tokenizers>=0.13,<1 +av>=11 +tqdm diff --git a/webui.py b/webui.py new file mode 100644 index 0000000000000000000000000000000000000000..cddbb02969d1a06c47e8b2f7037fabf75264f437 --- /dev/null +++ b/webui.py @@ -0,0 +1,1963 @@ +import os +import sys + +if len(sys.argv) == 1: + sys.argv.append("v2") +version = "v1" if sys.argv[1] == "v1" else "v2" +os.environ["version"] = version +now_dir = os.getcwd() +sys.path.insert(0, now_dir) +import warnings + +warnings.filterwarnings("ignore") +import json +import platform +import re +import shutil +import signal + +import psutil +import torch +import yaml + +os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO" +torch.manual_seed(233333) +tmp = os.path.join(now_dir, "TEMP") +os.makedirs(tmp, exist_ok=True) +os.environ["TEMP"] = tmp +if os.path.exists(tmp): + for name in os.listdir(tmp): + if name == "jieba.cache": + continue + path = "%s/%s" % (tmp, name) + delete = os.remove if os.path.isfile(path) else shutil.rmtree + try: + delete(path) + except Exception as e: + print(str(e)) + pass +import site +import traceback + +site_packages_roots = [] +for path in site.getsitepackages(): + if "packages" in path: + site_packages_roots.append(path) +if site_packages_roots == []: + site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir] +# os.environ["OPENBLAS_NUM_THREADS"] = "4" +os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" +os.environ["all_proxy"] = "" +for site_packages_root in site_packages_roots: + if os.path.exists(site_packages_root): + try: + with open("%s/users.pth" % (site_packages_root), "w") as f: + f.write( + # "%s\n%s/runtime\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5" + 
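+ # users.pth is a standard site-packages .pth file: every directory listed in it
+ # is added to sys.path, so the repo's GPT_SoVITS/ and tools/ packages import without installation.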
"%s\n%s/GPT_SoVITS/BigVGAN\n%s/tools\n%s/tools/asr\n%s/GPT_SoVITS\n%s/tools/uvr5" + % (now_dir, now_dir, now_dir, now_dir, now_dir, now_dir) + ) + break + except PermissionError: + traceback.print_exc() +import shutil +import subprocess +from subprocess import Popen + +from config import ( + exp_root, + infer_device, + is_half, + is_share, + python_exec, + webui_port_infer_tts, + webui_port_main, + webui_port_subfix, + webui_port_uvr5, +) +from tools import my_utils +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" +os.environ["language"] = language +i18n = I18nAuto(language=language) +from multiprocessing import cpu_count + +from tools.my_utils import check_details, check_for_existance + +# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu +try: + import gradio.analytics as analytics + + analytics.version_check = lambda: None +except: + ... +import gradio as gr + +n_cpu = cpu_count() + +ngpu = torch.cuda.device_count() +gpu_infos = [] +mem = [] +if_gpu_ok = False + +# 判断是否有能用来训练和加速推理的N卡 +ok_gpu_keywords = { + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "L4", + "4060", + "H", + "600", + "506", + "507", + "508", + "509", +} +set_gpu_numbers = set() +if torch.cuda.is_available() or ngpu != 0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + if any(value in gpu_name.upper() for value in ok_gpu_keywords): + # A10#A100#V100#A40#P40#M40#K80#A4500 + if_gpu_ok = True # 至少有一张能用的N卡 + gpu_infos.append("%s\t%s" % (i, gpu_name)) + set_gpu_numbers.add(i) + mem.append(int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)) +# # 判断是否支持mps加速 +# if torch.backends.mps.is_available(): +# if_gpu_ok = True +# gpu_infos.append("%s\t%s" % ("0", "Apple GPU")) +# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 + + +v3v4set={"v3","v4"} +def set_default(): + global \ + default_batch_size, \ + default_max_batch_size, \ + gpu_info, \ + default_sovits_epoch, \ + default_sovits_save_every_epoch, \ + max_sovits_epoch, \ + max_sovits_save_every_epoch, \ + default_batch_size_s1, \ + if_force_ckpt + if_force_ckpt = False + if if_gpu_ok and len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) + minmem = min(mem) + # if version == "v3" and minmem < 14: + # # API读取不到共享显存,直接填充确认 + # try: + # torch.zeros((1024,1024,1024,14),dtype=torch.int8,device="cuda") + # torch.cuda.empty_cache() + # minmem = 14 + # except RuntimeError as _: + # # 强制梯度检查只需要12G显存 + # if minmem >= 12 : + # if_force_ckpt = True + # minmem = 14 + # else: + # try: + # torch.zeros((1024,1024,1024,12),dtype=torch.int8,device="cuda") + # torch.cuda.empty_cache() + # if_force_ckpt = True + # minmem = 14 + # except RuntimeError as _: + # print("显存不足以开启V3训练") + default_batch_size = minmem // 2 if version not in v3v4set else minmem // 8 + default_batch_size_s1 = minmem // 2 + else: + gpu_info = "%s\t%s" % ("0", "CPU") + gpu_infos.append("%s\t%s" % ("0", "CPU")) + set_gpu_numbers.add(0) + default_batch_size = default_batch_size_s1 = int(psutil.virtual_memory().total / 1024 / 1024 / 1024 / 4) + if version not in v3v4set: + default_sovits_epoch = 8 + default_sovits_save_every_epoch = 4 + max_sovits_epoch = 25 # 40 + max_sovits_save_every_epoch = 25 # 10 + else: + default_sovits_epoch = 2 + default_sovits_save_every_epoch = 1 + max_sovits_epoch = 20 # 40 # 3 + max_sovits_save_every_epoch 
= 10 # 10 # 3 + + default_batch_size = max(1, default_batch_size) + default_batch_size_s1 = max(1, default_batch_size_s1) + default_max_batch_size = default_batch_size * 3 + + +set_default() + +gpus = "-".join([i[0] for i in gpu_infos]) +default_gpu_numbers = str(sorted(list(set_gpu_numbers))[0]) + + +def fix_gpu_number(input): # 将越界的number强制改到界内 + try: + if int(input) not in set_gpu_numbers: + return default_gpu_numbers + except: + return input + return input + + +def fix_gpu_numbers(inputs): + output = [] + try: + for input in inputs.split(","): + output.append(str(fix_gpu_number(input))) + return ",".join(output) + except: + return inputs + + +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + +pretrained_model_list = ( + pretrained_sovits_name[int(version[-1]) - 1], + pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + pretrained_gpt_name[int(version[-1]) - 1], + "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "GPT_SoVITS/pretrained_models/chinese-hubert-base", +) + +_ = "" +for i in pretrained_model_list: + if "s2Dv3" not in i and os.path.exists(i) == False: + _ += f"\n {i}" +if _: + print("warning: ", i18n("以下模型不存在:") + _) + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + else: + _[0].append("") ##没有下pretrained模型的,说不定他们是想自己从零训底模呢 + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) + else: + _[-1].append("") +pretrained_gpt_name, pretrained_sovits_name = _ + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for root in SoVITS_weight_root + GPT_weight_root: + os.makedirs(root, exist_ok=True) + + +def get_weights_names(): + SoVITS_names = [name for name in pretrained_sovits_name if name != ""] + for path in SoVITS_weight_root: + for name in os.listdir(path): + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [name for name in pretrained_gpt_name if name != ""] + for path in GPT_weight_root: + for name in os.listdir(path): + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) + return SoVITS_names, GPT_names + + +SoVITS_names, GPT_names = get_weights_names() +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + + +def custom_sort_key(s): + # 使用正则表达式提取字符串中的数字部分和非数字部分 + parts = re.split("(\d+)", s) + # 将数字部分转换为整数,非数字部分保持不变 + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def change_choices(): + SoVITS_names, GPT_names = get_weights_names() + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + + +p_label = None +p_uvr5 = None +p_asr = None +p_denoise = None +p_tts_inference = None + + +def kill_proc_tree(pid, including_parent=True): + try: + parent = psutil.Process(pid) + 
except psutil.NoSuchProcess: + # Process already terminated + return + + children = parent.children(recursive=True) + for child in children: + try: + os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL + except OSError: + pass + if including_parent: + try: + os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL + except OSError: + pass + + +system = platform.system() + + +def kill_process(pid, process_name=""): + if system == "Windows": + cmd = "taskkill /t /f /pid %s" % pid + # os.system(cmd) + subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + else: + kill_proc_tree(pid) + print(process_name + i18n("进程已终止")) + + +def process_info(process_name="", indicator=""): + if indicator == "opened": + return process_name + i18n("已开启") + elif indicator == "open": + return i18n("开启") + process_name + elif indicator == "closed": + return process_name + i18n("已关闭") + elif indicator == "close": + return i18n("关闭") + process_name + elif indicator == "running": + return process_name + i18n("运行中") + elif indicator == "occupy": + return process_name + i18n("占用中") + "," + i18n("需先终止才能开启下一次任务") + elif indicator == "finish": + return process_name + i18n("已完成") + elif indicator == "failed": + return process_name + i18n("失败") + elif indicator == "info": + return process_name + i18n("进程输出信息") + else: + return process_name + + +process_name_subfix = i18n("音频标注WebUI") + + +def change_label(path_list): + global p_label + if p_label is None: + check_for_existance([path_list]) + path_list = my_utils.clean_path(path_list) + cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( + python_exec, + path_list, + webui_port_subfix, + is_share, + ) + yield ( + process_info(process_name_subfix, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + print(cmd) + p_label = Popen(cmd, shell=True) + else: + kill_process(p_label.pid, process_name_subfix) + p_label = None + yield ( + process_info(process_name_subfix, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +process_name_uvr5 = i18n("人声分离WebUI") + + +def change_uvr5(): + global p_uvr5 + if p_uvr5 is None: + cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s' % (python_exec, infer_device, is_half, webui_port_uvr5, is_share) + yield ( + process_info(process_name_uvr5, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + print(cmd) + p_uvr5 = Popen(cmd, shell=True) + else: + kill_process(p_uvr5.pid, process_name_uvr5) + p_uvr5 = None + yield ( + process_info(process_name_uvr5, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +process_name_tts = i18n("TTS推理WebUI") + + +def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled): + global p_tts_inference + if batched_infer_enabled: + cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) + else: + cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) + # #####v3暂不支持加速推理 + # if version=="v3": + # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) + if p_tts_inference is None: + os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) + os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) + os.environ["cnhubert_base_path"] = cnhubert_base_path 
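+ # The spawned inference WebUI reads model paths, GPU id and port from these
+ # environment variables; only the UI language is passed on the command line.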
+ os.environ["bert_path"] = bert_path + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number) + os.environ["is_half"] = str(is_half) + os.environ["infer_ttswebui"] = str(webui_port_infer_tts) + os.environ["is_share"] = str(is_share) + yield ( + process_info(process_name_tts, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + print(cmd) + p_tts_inference = Popen(cmd, shell=True) + else: + kill_process(p_tts_inference.pid, process_name_tts) + p_tts_inference = None + yield ( + process_info(process_name_tts, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +from tools.asr.config import asr_dict + +process_name_asr = i18n("语音识别") + + +def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_precision): + global p_asr + if p_asr is None: + asr_inp_dir = my_utils.clean_path(asr_inp_dir) + asr_opt_dir = my_utils.clean_path(asr_opt_dir) + check_for_existance([asr_inp_dir]) + cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' + cmd += f' -i "{asr_inp_dir}"' + cmd += f' -o "{asr_opt_dir}"' + cmd += f" -s {asr_model_size}" + cmd += f" -l {asr_lang}" + cmd += f" -p {asr_precision}" + output_file_name = os.path.basename(asr_inp_dir) + output_folder = asr_opt_dir or "output/asr_opt" + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") + yield ( + process_info(process_name_asr, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + print(cmd) + p_asr = Popen(cmd, shell=True) + p_asr.wait() + p_asr = None + yield ( + process_info(process_name_asr, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": asr_inp_dir}, + ) + else: + yield ( + process_info(process_name_asr, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + + +def close_asr(): + global p_asr + if p_asr is not None: + kill_process(p_asr.pid, process_name_asr) + p_asr = None + return ( + process_info(process_name_asr, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +process_name_denoise = i18n("语音降噪") + + +def open_denoise(denoise_inp_dir, denoise_opt_dir): + global p_denoise + if p_denoise == None: + denoise_inp_dir = my_utils.clean_path(denoise_inp_dir) + denoise_opt_dir = my_utils.clean_path(denoise_opt_dir) + check_for_existance([denoise_inp_dir]) + cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % ( + python_exec, + denoise_inp_dir, + denoise_opt_dir, + "float16" if is_half == True else "float32", + ) + + yield ( + process_info(process_name_denoise, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + print(cmd) + p_denoise = Popen(cmd, shell=True) + p_denoise.wait() + p_denoise = None + yield ( + process_info(process_name_denoise, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": denoise_opt_dir}, + {"__type__": "update", "value": denoise_opt_dir}, + ) + else: + yield ( + 
process_info(process_name_denoise, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + + +def close_denoise(): + global p_denoise + if p_denoise is not None: + kill_process(p_denoise.pid, process_name_denoise) + p_denoise = None + return ( + process_info(process_name_denoise, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +p_train_SoVITS = None +process_name_sovits = i18n("SoVITS训练") + +def open1Ba( + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, +): + global p_train_SoVITS + if p_train_SoVITS == None: + with open("GPT_SoVITS/configs/s2.json") as f: + data = f.read() + data = json.loads(data) + s2_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s2_%s" % (s2_dir, version), exist_ok=True) + if check_for_existance([s2_dir], is_train=True): + check_details([s2_dir], is_train=True) + if is_half == False: + data["train"]["fp16_run"] = False + batch_size = max(1, batch_size // 2) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["train"]["text_low_lr_rate"] = text_low_lr_rate + data["train"]["pretrained_s2G"] = pretrained_s2G + data["train"]["pretrained_s2D"] = pretrained_s2D + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["save_every_epoch"] = save_every_epoch + data["train"]["gpu_numbers"] = gpu_numbers1Ba + data["train"]["grad_ckpt"] = if_grad_ckpt + data["train"]["lora_rank"] = lora_rank + data["model"]["version"] = version + data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir + data["save_weight_dir"] = SoVITS_weight_root[int(version[-1]) - 1] + data["name"] = exp_name + data["version"] = version + tmp_config_path = "%s/tmp_s2.json" % tmp + with open(tmp_config_path, "w") as f: + f.write(json.dumps(data)) + if version in ["v1", "v2"]: + cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) + else: + cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_sovits, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + ) + print(cmd) + p_train_SoVITS = Popen(cmd, shell=True) + p_train_SoVITS.wait() + p_train_SoVITS = None + SoVITS_dropdown_update, GPT_dropdown_update = change_choices() + yield ( + process_info(process_name_sovits, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update + ) + else: + yield ( + process_info(process_name_sovits, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + ) + + +def close1Ba(): + global p_train_SoVITS + if p_train_SoVITS is not None: + kill_process(p_train_SoVITS.pid, process_name_sovits) + p_train_SoVITS = None + return ( + process_info(process_name_sovits, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +p_train_GPT = None +process_name_gpt = i18n("GPT训练") + + +def open1Bb( + batch_size, + total_epoch, + exp_name, + if_dpo, + if_save_latest, + if_save_every_weights, + 
save_every_epoch, + gpu_numbers, + pretrained_s1, +): + global p_train_GPT + if p_train_GPT == None: + with open( + "GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml" + ) as f: + data = f.read() + data = yaml.load(data, Loader=yaml.FullLoader) + s1_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s1" % (s1_dir), exist_ok=True) + if check_for_existance([s1_dir], is_train=True): + check_details([s1_dir], is_train=True) + if is_half == False: + data["train"]["precision"] = "32" + batch_size = max(1, batch_size // 2) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["pretrained_s1"] = pretrained_s1 + data["train"]["save_every_n_epoch"] = save_every_epoch + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_dpo"] = if_dpo + data["train"]["half_weights_save_dir"] = GPT_weight_root[int(version[-1]) - 1] + data["train"]["exp_name"] = exp_name + data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir + data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir + data["output_dir"] = "%s/logs_s1_%s" % (s1_dir, version) + # data["version"]=version + + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_numbers(gpu_numbers.replace("-", ",")) + os.environ["hz"] = "25hz" + tmp_config_path = "%s/tmp_s1.yaml" % tmp + with open(tmp_config_path, "w") as f: + f.write(yaml.dump(data, default_flow_style=False)) + # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) + cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_gpt, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + ) + print(cmd) + p_train_GPT = Popen(cmd, shell=True) + p_train_GPT.wait() + p_train_GPT = None + SoVITS_dropdown_update, GPT_dropdown_update = change_choices() + yield ( + process_info(process_name_gpt, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update + ) + else: + yield ( + process_info(process_name_gpt, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + ) + + +def close1Bb(): + global p_train_GPT + if p_train_GPT is not None: + kill_process(p_train_GPT.pid, process_name_gpt) + p_train_GPT = None + return ( + process_info(process_name_gpt, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +ps_slice = [] +process_name_slice = i18n("语音切分") + + +def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, n_parts): + global ps_slice + inp = my_utils.clean_path(inp) + opt_root = my_utils.clean_path(opt_root) + check_for_existance([inp]) + if os.path.exists(inp) == False: + yield ( + i18n("输入路径不存在"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + return + if os.path.isfile(inp): + n_parts = 1 + elif os.path.isdir(inp): + pass + else: + yield ( + i18n("输入路径存在但不可用"), + {"__type__": "update", "visible": True}, + {"__type__": "update", 
"visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + return + if ps_slice == []: + for i_part in range(n_parts): + cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % ( + python_exec, + inp, + opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + i_part, + n_parts, + ) + print(cmd) + p = Popen(cmd, shell=True) + ps_slice.append(p) + yield ( + process_info(process_name_slice, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + for p in ps_slice: + p.wait() + ps_slice = [] + yield ( + process_info(process_name_slice, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + ) + else: + yield ( + process_info(process_name_slice, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + + +def close_slice(): + global ps_slice + if ps_slice != []: + for p_slice in ps_slice: + try: + kill_process(p_slice.pid, process_name_slice) + except: + traceback.print_exc() + ps_slice = [] + return ( + process_info(process_name_slice, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +ps1a = [] +process_name_1a = i18n("文本分词与特征提取") + + +def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): + global ps1a + inp_text = my_utils.clean_path(inp_text) + inp_wav_dir = my_utils.clean_path(inp_wav_dir) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1a == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, + } + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + "is_half": str(is_half), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + print(cmd) + p = Popen(cmd, shell=True) + ps1a.append(p) + yield ( + process_info(process_name_1a, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1a: + p.wait() + opt = [] + for i_part in range(all_parts): + txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) + with open(txt_path, "r", encoding="utf8") as f: + opt += f.read().strip("\n").split("\n") + os.remove(txt_path) + path_text = "%s/2-name2text.txt" % opt_dir + with open(path_text, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") + ps1a = [] + if len("".join(opt)) > 0: + yield ( + process_info(process_name_1a, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + else: + yield ( + process_info(process_name_1a, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + else: + yield ( + process_info(process_name_1a, "occupy"), + 
{"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + + +def close1a(): + global ps1a + if ps1a != []: + for p1a in ps1a: + try: + kill_process(p1a.pid, process_name_1a) + except: + traceback.print_exc() + ps1a = [] + return ( + process_info(process_name_1a, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +ps1b = [] +process_name_1b = i18n("语音自监督特征提取") + + +def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): + global ps1b + inp_text = my_utils.clean_path(inp_text) + inp_wav_dir = my_utils.clean_path(inp_wav_dir) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1b == []: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": "%s/%s" % (exp_root, exp_name), + "cnhubert_base_dir": ssl_pretrained_dir, + "is_half": str(is_half), + } + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec + print(cmd) + p = Popen(cmd, shell=True) + ps1b.append(p) + yield ( + process_info(process_name_1b, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1b: + p.wait() + ps1b = [] + yield ( + process_info(process_name_1b, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + else: + yield ( + process_info(process_name_1b, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + + +def close1b(): + global ps1b + if ps1b != []: + for p1b in ps1b: + try: + kill_process(p1b.pid, process_name_1b) + except: + traceback.print_exc() + ps1b = [] + return ( + process_info(process_name_1b, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +ps1c = [] +process_name_1c = i18n("语义Token提取") + + +def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): + global ps1c + inp_text = my_utils.clean_path(inp_text) + if check_for_existance([inp_text, ""], is_dataset_processing=True): + check_details([inp_text, ""], is_dataset_processing=True) + if ps1c == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", + "is_half": str(is_half), + } + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec + print(cmd) + p = Popen(cmd, shell=True) + ps1c.append(p) + yield ( + process_info(process_name_1c, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1c: + p.wait() + opt = ["item_name\tsemantic_audio"] + path_semantic = "%s/6-name2semantic.tsv" % opt_dir + for i_part in range(all_parts): + semantic_path = "%s/6-name2semantic-%s.tsv" % 
(opt_dir, i_part) + with open(semantic_path, "r", encoding="utf8") as f: + opt += f.read().strip("\n").split("\n") + os.remove(semantic_path) + with open(path_semantic, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") + ps1c = [] + yield ( + process_info(process_name_1c, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + else: + yield ( + process_info(process_name_1c, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + + +def close1c(): + global ps1c + if ps1c != []: + for p1c in ps1c: + try: + kill_process(p1c.pid, process_name_1c) + except: + traceback.print_exc() + ps1c = [] + return ( + process_info(process_name_1c, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +ps1abc = [] +process_name_1abc = i18n("训练集格式化一键三连") + + +def open1abc( + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + ssl_pretrained_dir, + pretrained_s2G_path, +): + global ps1abc + inp_text = my_utils.clean_path(inp_text) + inp_wav_dir = my_utils.clean_path(inp_wav_dir) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1abc == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + try: + #############################1a + path_text = "%s/2-name2text.txt" % opt_dir + if os.path.exists(path_text) == False or ( + os.path.exists(path_text) == True + and len(open(path_text, "r", encoding="utf8").read().strip("\n").split("\n")) < 2 + ): + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, + "is_half": str(is_half), + } + gpu_names = gpu_numbers1a.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + print(cmd) + p = Popen(cmd, shell=True) + ps1abc.append(p) + yield ( + i18n("进度") + ": 1A-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() + + opt = [] + for i_part in range(all_parts): # txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) + txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) + with open(txt_path, "r", encoding="utf8") as f: + opt += f.read().strip("\n").split("\n") + os.remove(txt_path) + with open(path_text, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") + assert len("".join(opt)) > 0, process_info(process_name_1a, "failed") + yield ( + i18n("进度") + ": 1A-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] + #############################1b + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "cnhubert_base_dir": ssl_pretrained_dir, + } + gpu_names = gpu_numbers1Ba.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec + print(cmd) + 
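+ # Step 1B: one 2-get-hubert-wav32k.py worker is spawned per GPU in gpu_numbers1Ba;
+ # i_part/all_parts passed via the environment identify each worker's shard.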
p = Popen(cmd, shell=True) + ps1abc.append(p) + yield ( + i18n("进度") + ": 1A-Done, 1B-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() + yield ( + i18n("进度") + ": 1A-Done, 1B-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] + #############################1c + path_semantic = "%s/6-name2semantic.tsv" % opt_dir + if os.path.exists(path_semantic) == False or ( + os.path.exists(path_semantic) == True and os.path.getsize(path_semantic) < 31 + ): + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", + } + gpu_names = gpu_numbers1c.split("-") + all_parts = len(gpu_names) + for i_part in range(all_parts): + config.update( + { + "i_part": str(i_part), + "all_parts": str(all_parts), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), + } + ) + os.environ.update(config) + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec + print(cmd) + p = Popen(cmd, shell=True) + ps1abc.append(p) + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() + + opt = ["item_name\tsemantic_audio"] + for i_part in range(all_parts): + semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) + with open(semantic_path, "r", encoding="utf8") as f: + opt += f.read().strip("\n").split("\n") + os.remove(semantic_path) + with open(path_semantic, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] + yield ( + process_info(process_name_1abc, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + except: + traceback.print_exc() + close1abc() + yield ( + process_info(process_name_1abc, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + else: + yield ( + process_info(process_name_1abc, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + + +def close1abc(): + global ps1abc + if ps1abc != []: + for p1abc in ps1abc: + try: + kill_process(p1abc.pid, process_name_1abc) + except: + traceback.print_exc() + ps1abc = [] + return ( + process_info(process_name_1abc, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + +def switch_version(version_): + os.environ["version"] = version_ + global version + version = version_ + if pretrained_sovits_name[int(version[-1]) - 1] != "" and pretrained_gpt_name[int(version[-1]) - 1] != "": + ... 
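+ # Both pretrained checkpoints for the selected version were found at startup; no warning is needed.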
+ else: + gr.Warning(i18n("未下载模型") + ": " + version.upper()) + set_default() + return ( + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D")}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": default_batch_size, "maximum": default_max_batch_size}, + {"__type__": "update", "value": default_sovits_epoch, "maximum": max_sovits_epoch}, + {"__type__": "update", "value": default_sovits_save_every_epoch, "maximum": max_sovits_save_every_epoch}, + {"__type__": "update", "visible": True if version not in v3v4set else False}, + { + "__type__": "update", + "value": False if not if_force_ckpt else True, + "interactive": True if not if_force_ckpt else False, + }, + {"__type__": "update", "interactive": True, "value": False}, + {"__type__": "update", "visible": True if version in v3v4set else False}, + ) # {'__type__': 'update', "interactive": False if version in v3v4set else True, "value": False}, \ ####batch infer + + +if os.path.exists("GPT_SoVITS/text/G2PWModel"): + ... +else: + cmd = '"%s" GPT_SoVITS/download.py' % python_exec + p = Popen(cmd, shell=True) + p.wait() + + +def sync(text): + return {"__type__": "update", "value": text} + + +with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + ) + gr.Markdown(value=i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e") + + with gr.Tabs(): + with gr.TabItem("0-" + i18n("前置数据集获取工具")): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 + gr.Markdown(value="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具")) + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info")) + open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"), variant="primary", visible=True) + close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"), variant="primary", visible=False) + + gr.Markdown(value="0b-" + i18n("语音切分工具")) + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"), value="") + slice_opt_root = gr.Textbox(label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt") + with gr.Row(): + threshold = gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34") + min_length = gr.Textbox( + label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"), + value="4000", + ) + min_interval = gr.Textbox(label=i18n("min_interval:最短切割间隔"), value="300") + hop_size = gr.Textbox( + label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"), + value="10", + ) + max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500") + with gr.Row(): + _max = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("max:归一化后最大值多少"), + value=0.9, + interactive=True, + ) + alpha = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("alpha_mix:混多少比例归一化后音频进来"), + value=0.25, + interactive=True, + ) + with gr.Row(): + n_process = gr.Slider( + minimum=1, maximum=n_cpu, step=1, label=i18n("切割使用的进程数"), value=4, interactive=True + ) + slicer_info = gr.Textbox(label=process_info(process_name_slice, "info")) + open_slicer_button = gr.Button( + value=process_info(process_name_slice, "open"), variant="primary", visible=True + ) + close_slicer_button = gr.Button( + value=process_info(process_name_slice, "close"), variant="primary", visible=False + ) + + gr.Markdown(value="0bb-" + i18n("语音降噪工具")) + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="") + denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt") + with gr.Row(): + denoise_info = gr.Textbox(label=process_info(process_name_denoise, "info")) + open_denoise_button = gr.Button( + value=process_info(process_name_denoise, "open"), variant="primary", visible=True + ) + close_denoise_button = gr.Button( + value=process_info(process_name_denoise, "close"), variant="primary", visible=False + ) + + gr.Markdown(value="0c-" + i18n("语音识别工具")) + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + asr_inp_dir = gr.Textbox( + label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True + ) + asr_opt_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True) + with gr.Row(): + asr_model = gr.Dropdown( + label=i18n("ASR 模型"), + choices=list(asr_dict.keys()), + interactive=True, + value="达摩 ASR (中文)", + ) + asr_size = gr.Dropdown( + label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large" + ) + asr_lang = gr.Dropdown( + label=i18n("ASR 语言设置"), choices=["zh", "yue"], interactive=True, value="zh" + ) + asr_precision = gr.Dropdown( + label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32" + ) + with gr.Row(): + asr_info = 
gr.Textbox(label=process_info(process_name_asr, "info"))
+                open_asr_button = gr.Button(
+                    value=process_info(process_name_asr, "open"), variant="primary", visible=True
+                )
+                close_asr_button = gr.Button(
+                    value=process_info(process_name_asr, "close"), variant="primary", visible=False
+                )
+
+            def change_lang_choices(key):  # update the selectable languages based on the chosen ASR model
+                return {"__type__": "update", "choices": asr_dict[key]["lang"], "value": asr_dict[key]["lang"][0]}
+
+            def change_size_choices(key):  # update the selectable model sizes based on the chosen ASR model
+                return {"__type__": "update", "choices": asr_dict[key]["size"], "value": asr_dict[key]["size"][-1]}
+
+            def change_precision_choices(key):  # update the selectable precisions based on the chosen ASR model
+                if key == "Faster Whisper (多语种)":
+                    if default_batch_size <= 4:
+                        precision = "int8"
+                    elif is_half:
+                        precision = "float16"
+                    else:
+                        precision = "float32"
+                else:
+                    precision = "float32"
+                return {"__type__": "update", "choices": asr_dict[key]["precision"], "value": precision}
+
+            asr_model.change(change_lang_choices, [asr_model], [asr_lang])
+            asr_model.change(change_size_choices, [asr_model], [asr_size])
+            asr_model.change(change_precision_choices, [asr_model], [asr_precision])
+
+            gr.Markdown(value="0d-" + i18n("语音文本校对标注工具"))
+            with gr.Row():
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        path_list = gr.Textbox(
+                            label=i18n("标注文件路径 (含文件后缀 *.list)"),
+                            value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
+                            interactive=True,
+                        )
+                        label_info = gr.Textbox(label=process_info(process_name_subfix, "info"))
+                open_label = gr.Button(value=process_info(process_name_subfix, "open"), variant="primary", visible=True)
+                close_label = gr.Button(
+                    value=process_info(process_name_subfix, "close"), variant="primary", visible=False
+                )
+
+            open_label.click(change_label, [path_list], [label_info, open_label, close_label])
+            close_label.click(change_label, [path_list], [label_info, open_label, close_label])
+            open_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5])
+            close_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5])
+
+        with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
+            with gr.Row():
+                with gr.Row():
+                    exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
+                    gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
+                    version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"])  # , "v3"
+                with gr.Row():
+                    pretrained_s2G = gr.Textbox(
+                        label=i18n("预训练SoVITS-G模型路径"),
+                        value=pretrained_sovits_name[int(version[-1]) - 1],
+                        interactive=True,
+                        lines=2,
+                        max_lines=3,
+                        scale=9,
+                    )
+                    pretrained_s2D = gr.Textbox(
+                        label=i18n("预训练SoVITS-D模型路径"),
+                        value=pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"),
+                        interactive=True,
+                        lines=2,
+                        max_lines=3,
+                        scale=9,
+                    )
+                    pretrained_s1 = gr.Textbox(
+                        label=i18n("预训练GPT模型路径"),
+                        value=pretrained_gpt_name[int(version[-1]) - 1],
+                        interactive=True,
+                        lines=2,
+                        max_lines=3,
+                        scale=10,
+                    )
+
+            with gr.TabItem("1A-" + i18n("训练集格式化工具")):
+                gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
+                with gr.Row():
+                    with gr.Row():
+                        inp_text = gr.Textbox(
+                            label=i18n("*文本标注文件"),
+                            value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",
+                            interactive=True,
+                            scale=10,
+                        )
+                    with gr.Row():
+                        inp_wav_dir = gr.Textbox(
+                            label=i18n("*训练集音频文件目录"),
+                            # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
+                            interactive=True,
+                            placeholder=i18n(
+                                "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。"
+                            ),
+                            scale=10,
+                        )
+
+                gr.Markdown(value="1Aa-" + process_name_1a)
+                with gr.Row():
+                    with gr.Row():
+                        gpu_numbers1a = 
gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) + with gr.Row(): + bert_pretrained_dir = gr.Textbox( + label=i18n("预训练中文BERT模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + interactive=False, + lines=2, + ) + with gr.Row(): + button1a_open = gr.Button( + value=process_info(process_name_1a, "open"), variant="primary", visible=True + ) + button1a_close = gr.Button( + value=process_info(process_name_1a, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1a = gr.Textbox(label=process_info(process_name_1a, "info")) + + gr.Markdown(value="1Ab-" + process_name_1b) + with gr.Row(): + with gr.Row(): + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) + with gr.Row(): + cnhubert_base_dir = gr.Textbox( + label=i18n("预训练SSL模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-hubert-base", + interactive=False, + lines=2, + ) + with gr.Row(): + button1b_open = gr.Button( + value=process_info(process_name_1b, "open"), variant="primary", visible=True + ) + button1b_close = gr.Button( + value=process_info(process_name_1b, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1b = gr.Textbox(label=process_info(process_name_1b, "info")) + + gr.Markdown(value="1Ac-" + process_name_1c) + with gr.Row(): + with gr.Row(): + gpu_numbers1c = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) + with gr.Row(): + pretrained_s2G_ = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=False, + lines=2, + ) + with gr.Row(): + button1c_open = gr.Button( + value=process_info(process_name_1c, "open"), variant="primary", visible=True + ) + button1c_close = gr.Button( + value=process_info(process_name_1c, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1c = gr.Textbox(label=process_info(process_name_1c, "info")) + + gr.Markdown(value="1Aabc-" + process_name_1abc) + with gr.Row(): + with gr.Row(): + button1abc_open = gr.Button( + value=process_info(process_name_1abc, "open"), variant="primary", visible=True + ) + button1abc_close = gr.Button( + value=process_info(process_name_1abc, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1abc = gr.Textbox(label=process_info(process_name_1abc, "info")) + + pretrained_s2G.change(sync, [pretrained_s2G], [pretrained_s2G_]) + open_asr_button.click( + open_asr, + [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], + [asr_info, open_asr_button, close_asr_button, path_list, inp_text, inp_wav_dir], + ) + close_asr_button.click(close_asr, [], [asr_info, open_asr_button, close_asr_button]) + open_slicer_button.click( + open_slice, + [ + slice_inp_path, + slice_opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + n_process, + ], + [slicer_info, open_slicer_button, close_slicer_button, asr_inp_dir, denoise_input_dir, inp_wav_dir], + ) + close_slicer_button.click(close_slice, [], [slicer_info, open_slicer_button, close_slicer_button]) + open_denoise_button.click( + open_denoise, + [denoise_input_dir, denoise_output_dir], + [denoise_info, open_denoise_button, close_denoise_button, asr_inp_dir, inp_wav_dir], + ) + close_denoise_button.click(close_denoise, [], [denoise_info, open_denoise_button, close_denoise_button]) + + button1a_open.click( + open1a, + [inp_text, inp_wav_dir, exp_name, 
gpu_numbers1a, bert_pretrained_dir], + [info1a, button1a_open, button1a_close], + ) + button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) + button1b_open.click( + open1b, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], + [info1b, button1b_open, button1b_close], + ) + button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) + button1c_open.click( + open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close] + ) + button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close]) + button1abc_open.click( + open1abc, + [ + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + cnhubert_base_dir, + pretrained_s2G, + ], + [info1abc, button1abc_open, button1abc_close], + ) + button1abc_close.click(close1abc, [], [info1abc, button1abc_open, button1abc_close]) + + with gr.TabItem("1B-" + i18n("微调训练")): + gr.Markdown(value="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) + with gr.Row(): + with gr.Column(): + with gr.Row(): + batch_size = gr.Slider( + minimum=1, + maximum=default_max_batch_size, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + total_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_epoch, + step=1, + label=i18n("总训练轮数total_epoch,不建议太高"), + value=default_sovits_epoch, + interactive=True, + ) + with gr.Row(): + text_low_lr_rate = gr.Slider( + minimum=0.2, + maximum=0.6, + step=0.05, + label=i18n("文本模块学习率权重"), + value=0.4, + visible=True if version not in v3v4set else False, + ) # v3v4 not need + lora_rank = gr.Radio( + label=i18n("LoRA秩"), + value="32", + choices=["16", "32", "64", "128"], + visible=True if version in v3v4set else False, + ) # v1v2 not need + save_every_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_save_every_epoch, + step=1, + label=i18n("保存频率save_every_epoch"), + value=default_sovits_save_every_epoch, + interactive=True, + ) + with gr.Column(): + with gr.Column(): + if_save_latest = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + if_grad_ckpt = gr.Checkbox( + label="v3是否开启梯度检查点节省显存占用", + value=False, + interactive=True if version in v3v4set else False, + show_label=True, + visible=False, + ) # 只有V3s2可以用 + with gr.Row(): + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) + with gr.Row(): + with gr.Row(): + button1Ba_open = gr.Button( + value=process_info(process_name_sovits, "open"), variant="primary", visible=True + ) + button1Ba_close = gr.Button( + value=process_info(process_name_sovits, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info")) + gr.Markdown(value="1Bb-" + i18n("GPT 训练: 模型权重文件在 GPT_weights/")) + with gr.Row(): + with gr.Column(): + with gr.Row(): + batch_size1Bb = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size_s1, + interactive=True, + ) + total_epoch1Bb = gr.Slider( + minimum=2, + maximum=50, + step=1, + label=i18n("总训练轮数total_epoch"), + value=15, + interactive=True, + ) + with gr.Row(): + save_every_epoch1Bb = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + 
interactive=True, + ) + if_dpo = gr.Checkbox( + label=i18n("是否开启DPO训练选项(实验性)"), + value=False, + interactive=True, + show_label=True, + ) + with gr.Column(): + with gr.Column(): + if_save_latest1Bb = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights1Bb = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + with gr.Row(): + gpu_numbers1Bb = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) + with gr.Row(): + with gr.Row(): + button1Bb_open = gr.Button( + value=process_info(process_name_gpt, "open"), variant="primary", visible=True + ) + button1Bb_close = gr.Button( + value=process_info(process_name_gpt, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info")) + + button1Ba_close.click(close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close]) + button1Bb_close.click(close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close]) + + with gr.TabItem("1C-" + i18n("推理")): + gr.Markdown( + value=i18n( + "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" + ) + ) + with gr.Row(): + with gr.Row(): + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=pretrained_gpt_name[0], + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=pretrained_sovits_name[0], + interactive=True, + ) + with gr.Row(): + gpu_number_1C = gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) + with gr.Row(): + with gr.Row(): + batched_infer_enabled = gr.Checkbox( + label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True + ) + with gr.Row(): + open_tts = gr.Button( + value=process_info(process_name_tts, "open"), variant="primary", visible=True + ) + close_tts = gr.Button( + value=process_info(process_name_tts, "close"), variant="primary", visible=False + ) + with gr.Row(): + tts_info = gr.Textbox(label=process_info(process_name_tts, "info")) + open_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) + close_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) + button1Ba_open.click( + open1Ba, + [ + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, + ], + [info1Ba, button1Ba_open, button1Ba_close,SoVITS_dropdown,GPT_dropdown], + ) + button1Bb_open.click( + open1Bb, + [ + batch_size1Bb, + total_epoch1Bb, + exp_name, + if_dpo, + if_save_latest1Bb, + if_save_every_weights1Bb, + save_every_epoch1Bb, + gpu_numbers1Bb, + pretrained_s1, + ], + [info1Bb, button1Bb_open, button1Bb_close,SoVITS_dropdown,GPT_dropdown], + ) + version_checkbox.change( + switch_version, + [version_checkbox], + [ + pretrained_s2G, + pretrained_s2D, + pretrained_s1, + GPT_dropdown, + SoVITS_dropdown, + 
batch_size,
+                        total_epoch,
+                        save_every_epoch,
+                        text_low_lr_rate,
+                        if_grad_ckpt,
+                        batched_infer_enabled,
+                        lora_rank,
+                    ],
+                )
+
+        with gr.TabItem(i18n("2-GPT-SoVITS-变声")):
+            gr.Markdown(value=i18n("施工中,请静候佳音"))
+
+    app.queue().launch(  # concurrency_count=511, max_size=1022
+        server_name="0.0.0.0",
+        inbrowser=True,
+        share=is_share,
+        server_port=webui_port_main,
+        # quiet=True,
+    )