diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..1aecc638e2db2994100893e0155fb17d8c17d56e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+GPT_SoVITS/text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b7ad11a2f052c3cd0fed65802a448f18fbee2d4
--- /dev/null
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -0,0 +1,1463 @@
+import gc
+import math
+import os
+import random
+import sys
+import time
+import traceback
+from copy import deepcopy
+
+import torchaudio
+from tqdm import tqdm
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+import os
+from typing import List, Tuple, Union
+
+import ffmpeg
+import librosa
+import numpy as np
+import torch
+import torch.nn.functional as F
+import yaml
+from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from BigVGAN.bigvgan import BigVGAN
+from feature_extractor.cnhubert import CNHubert
+from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
+from module.models import SynthesizerTrn, SynthesizerTrnV3
+from peft import LoraConfig, get_peft_model
+from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+from tools.audio_sr import AP_BWE
+from tools.i18n.i18n import I18nAuto, scan_language_list
+from tools.my_utils import load_audio
+from TTS_infer_pack.text_segmentation_method import splits
+from TTS_infer_pack.TextPreprocessor import TextPreprocessor
+
+language = os.environ.get("language", "Auto")
+language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
+i18n = I18nAuto(language=language)
+
+
+spec_min = -12
+spec_max = 2
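+
+# norm_spec maps mel values from [spec_min, spec_max] = [-12, 2] onto [-1, 1]; denorm_spec inverts that mapping.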
+
+
+def norm_spec(x):
+ return (x - spec_min) / (spec_max - spec_min) * 2 - 1
+
+
+def denorm_spec(x):
+ return (x + 1) / 2 * (spec_max - spec_min) + spec_min
+
+
+mel_fn = lambda x: mel_spectrogram_torch(
+ x,
+ **{
+ "n_fft": 1024,
+ "win_size": 1024,
+ "hop_size": 256,
+ "num_mels": 100,
+ "sampling_rate": 24000,
+ "fmin": 0,
+ "fmax": None,
+ "center": False,
+ },
+)
+
+
+def speed_change(input_audio: np.ndarray, speed: float, sr: int):
+    # Convert the NumPy array into a raw PCM byte stream
+ raw_audio = input_audio.astype(np.int16).tobytes()
+
+    # Set up the ffmpeg input stream
+ input_stream = ffmpeg.input("pipe:", format="s16le", acodec="pcm_s16le", ar=str(sr), ac=1)
+
+    # Apply the tempo change
+ output_stream = input_stream.filter("atempo", speed)
+
+    # Write the output stream to a pipe
+ out, _ = output_stream.output("pipe:", format="s16le", acodec="pcm_s16le").run(
+ input=raw_audio, capture_stdout=True, capture_stderr=True
+ )
+
+    # Decode the piped output back into a NumPy array
+ processed_audio = np.frombuffer(out, np.int16)
+
+ return processed_audio
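+
+# Illustrative usage (assumed parameters): speed_change(audio, speed=1.25, sr=32000) time-stretches int16-range
+# mono audio with ffmpeg's atempo filter and returns int16 samples at the same sample rate.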
+
+
+resample_transform_dict = {}
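+# One torchaudio Resample transform is cached per source sample rate; every conversion targets 24 kHz,
+# the rate used by the v3 mel features and the BigVGAN vocoder.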
+
+
+def resample(audio_tensor, sr0, device):
+ global resample_transform_dict
+ if sr0 not in resample_transform_dict:
+ resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
+ return resample_transform_dict[sr0](audio_tensor)
+
+
+class DictToAttrRecursive(dict):
+ def __init__(self, input_dict):
+ super().__init__(input_dict)
+ for key, value in input_dict.items():
+ if isinstance(value, dict):
+ value = DictToAttrRecursive(value)
+ self[key] = value
+ setattr(self, key, value)
+
+ def __getattr__(self, item):
+ try:
+ return self[item]
+ except KeyError:
+ raise AttributeError(f"Attribute {item} not found")
+
+ def __setattr__(self, key, value):
+ if isinstance(value, dict):
+ value = DictToAttrRecursive(value)
+ super(DictToAttrRecursive, self).__setitem__(key, value)
+ super().__setattr__(key, value)
+
+ def __delattr__(self, item):
+ try:
+ del self[item]
+ except KeyError:
+ raise AttributeError(f"Attribute {item} not found")
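+
+
+# Illustrative note: DictToAttrRecursive exposes nested dicts through attribute access, e.g.
+#   cfg = DictToAttrRecursive({"train": {"segment_size": 20480}})
+#   cfg.train.segment_size  # -> 20480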
+
+
+class NO_PROMPT_ERROR(Exception):
+ pass
+
+
+# configs/tts_infer.yaml
+"""
+custom:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+ vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+ version: v2
+default:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+ vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
+ version: v1
+default_v2:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
+ vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+ version: v2
+default_v3:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
+ vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
+ version: v3
+"""
+
+
+def set_seed(seed: int):
+ seed = int(seed)
+ seed = seed if seed != -1 else random.randint(0, 2**32 - 1)
+ print(f"Set seed to {seed}")
+ os.environ["PYTHONHASHSEED"] = str(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ try:
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ # torch.backends.cudnn.deterministic = True
+ # torch.backends.cudnn.benchmark = False
+ # torch.backends.cudnn.enabled = True
+            # enabling TF32 would affect precision
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+ except:
+ pass
+ return seed
+
+
+class TTS_Config:
+ default_configs = {
+ "v1": {
+ "device": "cpu",
+ "is_half": False,
+ "version": "v1",
+ "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+ "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
+ "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
+ "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+ },
+ "v2": {
+ "device": "cpu",
+ "is_half": False,
+ "version": "v2",
+ "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
+ "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
+ "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
+ "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+ },
+ "v3": {
+ "device": "cpu",
+ "is_half": False,
+ "version": "v3",
+ "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
+ "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth",
+ "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
+ "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+ },
+ }
+ configs: dict = None
+ v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
+ v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
+ languages: list = v2_languages
+ # "all_zh",#全部按中文识别
+ # "en",#全部按英文识别#######不变
+ # "all_ja",#全部按日文识别
+ # "all_yue",#全部按中文识别
+ # "all_ko",#全部按韩文识别
+ # "zh",#按中英混合识别####不变
+ # "ja",#按日英混合识别####不变
+ # "yue",#按粤英混合识别####不变
+ # "ko",#按韩英混合识别####不变
+ # "auto",#多语种启动切分识别语种
+ # "auto_yue",#多语种启动切分识别语种
+
+ def __init__(self, configs: Union[dict, str] = None):
+        # default config file path
+ configs_base_path: str = "GPT_SoVITS/configs/"
+ os.makedirs(configs_base_path, exist_ok=True)
+ self.configs_path: str = os.path.join(configs_base_path, "tts_infer.yaml")
+
+ if configs in ["", None]:
+ if not os.path.exists(self.configs_path):
+ self.save_configs()
+ print(f"Create default config file at {self.configs_path}")
+ configs: dict = deepcopy(self.default_configs)
+
+ if isinstance(configs, str):
+ self.configs_path = configs
+ configs: dict = self._load_configs(self.configs_path)
+
+ assert isinstance(configs, dict)
+ version = configs.get("version", "v2").lower()
+ assert version in ["v1", "v2", "v3"]
+ self.default_configs[version] = configs.get(version, self.default_configs[version])
+ self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))
+
+ self.device = self.configs.get("device", torch.device("cpu"))
+ if "cuda" in str(self.device) and not torch.cuda.is_available():
+ print("Warning: CUDA is not available, set device to CPU.")
+ self.device = torch.device("cpu")
+
+ self.is_half = self.configs.get("is_half", False)
+ # if str(self.device) == "cpu" and self.is_half:
+ # print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
+ # self.is_half = False
+
+ self.version = version
+ self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
+ self.vits_weights_path = self.configs.get("vits_weights_path", None)
+ self.bert_base_path = self.configs.get("bert_base_path", None)
+ self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
+ self.languages = self.v1_languages if self.version == "v1" else self.v2_languages
+
+ self.is_v3_synthesizer: bool = False
+
+ if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
+ self.t2s_weights_path = self.default_configs[version]["t2s_weights_path"]
+ print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
+ if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)):
+ self.vits_weights_path = self.default_configs[version]["vits_weights_path"]
+ print(f"fall back to default vits_weights_path: {self.vits_weights_path}")
+ if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)):
+ self.bert_base_path = self.default_configs[version]["bert_base_path"]
+ print(f"fall back to default bert_base_path: {self.bert_base_path}")
+ if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)):
+ self.cnhuhbert_base_path = self.default_configs[version]["cnhuhbert_base_path"]
+ print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
+ self.update_configs()
+
+ self.max_sec = None
+ self.hz: int = 50
+ self.semantic_frame_rate: str = "25hz"
+ self.segment_size: int = 20480
+ self.filter_length: int = 2048
+ self.sampling_rate: int = 32000
+ self.hop_length: int = 640
+ self.win_length: int = 2048
+ self.n_speakers: int = 300
+
+ def _load_configs(self, configs_path: str) -> dict:
+ if os.path.exists(configs_path):
+ ...
+ else:
+ print(i18n("路径不存在,使用默认配置"))
+ self.save_configs(configs_path)
+ with open(configs_path, "r", encoding="utf-8") as f:
+ configs = yaml.load(f, Loader=yaml.FullLoader)
+
+ return configs
+
+ def save_configs(self, configs_path: str = None) -> None:
+ configs = deepcopy(self.default_configs)
+ if self.configs is not None:
+ configs["custom"] = self.update_configs()
+
+ if configs_path is None:
+ configs_path = self.configs_path
+ with open(configs_path, "w") as f:
+ yaml.dump(configs, f)
+
+ def update_configs(self):
+ self.config = {
+ "device": str(self.device),
+ "is_half": self.is_half,
+ "version": self.version,
+ "t2s_weights_path": self.t2s_weights_path,
+ "vits_weights_path": self.vits_weights_path,
+ "bert_base_path": self.bert_base_path,
+ "cnhuhbert_base_path": self.cnhuhbert_base_path,
+ }
+ return self.config
+
+ def update_version(self, version: str) -> None:
+ self.version = version
+ self.languages = self.v1_languages if self.version == "v1" else self.v2_languages
+
+ def __str__(self):
+ self.configs = self.update_configs()
+ string = "TTS Config".center(100, "-") + "\n"
+ for k, v in self.configs.items():
+ string += f"{str(k).ljust(20)}: {str(v)}\n"
+ string += "-" * 100 + "\n"
+ return string
+
+ def __repr__(self):
+ return self.__str__()
+
+ def __hash__(self):
+ return hash(self.configs_path)
+
+ def __eq__(self, other):
+ return isinstance(other, TTS_Config) and self.configs_path == other.configs_path
+
+
+class TTS:
+ def __init__(self, configs: Union[dict, str, TTS_Config]):
+ if isinstance(configs, TTS_Config):
+ self.configs = configs
+ else:
+ self.configs: TTS_Config = TTS_Config(configs)
+
+ self.t2s_model: Text2SemanticLightningModule = None
+ self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None
+ self.bert_tokenizer: AutoTokenizer = None
+ self.bert_model: AutoModelForMaskedLM = None
+ self.cnhuhbert_model: CNHubert = None
+ self.bigvgan_model: BigVGAN = None
+ self.sr_model: AP_BWE = None
+ self.sr_model_not_exist: bool = False
+
+ self._init_models()
+
+ self.text_preprocessor: TextPreprocessor = TextPreprocessor(
+ self.bert_model, self.bert_tokenizer, self.configs.device
+ )
+
+ self.prompt_cache: dict = {
+ "ref_audio_path": None,
+ "prompt_semantic": None,
+ "refer_spec": [],
+ "prompt_text": None,
+ "prompt_lang": None,
+ "phones": None,
+ "bert_features": None,
+ "norm_text": None,
+ "aux_ref_audio_paths": [],
+ }
+
+ self.stop_flag: bool = False
+ self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32
+
+ def _init_models(
+ self,
+ ):
+ self.init_t2s_weights(self.configs.t2s_weights_path)
+ self.init_vits_weights(self.configs.vits_weights_path)
+ self.init_bert_weights(self.configs.bert_base_path)
+ self.init_cnhuhbert_weights(self.configs.cnhuhbert_base_path)
+ # self.enable_half_precision(self.configs.is_half)
+
+ def init_cnhuhbert_weights(self, base_path: str):
+ print(f"Loading CNHuBERT weights from {base_path}")
+ self.cnhuhbert_model = CNHubert(base_path)
+ self.cnhuhbert_model = self.cnhuhbert_model.eval()
+ self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device)
+ if self.configs.is_half and str(self.configs.device) != "cpu":
+ self.cnhuhbert_model = self.cnhuhbert_model.half()
+
+ def init_bert_weights(self, base_path: str):
+ print(f"Loading BERT weights from {base_path}")
+ self.bert_tokenizer = AutoTokenizer.from_pretrained(base_path)
+ self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path)
+ self.bert_model = self.bert_model.eval()
+ self.bert_model = self.bert_model.to(self.configs.device)
+ if self.configs.is_half and str(self.configs.device) != "cpu":
+ self.bert_model = self.bert_model.half()
+
+ def init_vits_weights(self, weights_path: str):
+ self.configs.vits_weights_path = weights_path
+ version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
+ path_sovits_v3 = self.configs.default_configs["v3"]["vits_weights_path"]
+
+ if if_lora_v3 == True and os.path.exists(path_sovits_v3) == False:
+ info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
+ raise FileExistsError(info)
+
+ # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
+ dict_s2 = load_sovits_new(weights_path)
+ hps = dict_s2["config"]
+
+ hps["model"]["semantic_frame_rate"] = "25hz"
+ if "enc_p.text_embedding.weight" not in dict_s2["weight"]:
+ hps["model"]["version"] = "v2" # v3model,v2sybomls
+ elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
+ hps["model"]["version"] = "v1"
+ else:
+ hps["model"]["version"] = "v2"
+ # version = hps["model"]["version"]
+
+ self.configs.filter_length = hps["data"]["filter_length"]
+ self.configs.segment_size = hps["train"]["segment_size"]
+ self.configs.sampling_rate = hps["data"]["sampling_rate"]
+ self.configs.hop_length = hps["data"]["hop_length"]
+ self.configs.win_length = hps["data"]["win_length"]
+ self.configs.n_speakers = hps["data"]["n_speakers"]
+ self.configs.semantic_frame_rate = hps["model"]["semantic_frame_rate"]
+ kwargs = hps["model"]
+ # print(f"self.configs.sampling_rate:{self.configs.sampling_rate}")
+
+ self.configs.update_version(model_version)
+
+ # print(f"model_version:{model_version}")
+ # print(f'hps["model"]["version"]:{hps["model"]["version"]}')
+ if model_version != "v3":
+ vits_model = SynthesizerTrn(
+ self.configs.filter_length // 2 + 1,
+ self.configs.segment_size // self.configs.hop_length,
+ n_speakers=self.configs.n_speakers,
+ **kwargs,
+ )
+ self.configs.is_v3_synthesizer = False
+ else:
+ vits_model = SynthesizerTrnV3(
+ self.configs.filter_length // 2 + 1,
+ self.configs.segment_size // self.configs.hop_length,
+ n_speakers=self.configs.n_speakers,
+ **kwargs,
+ )
+ self.configs.is_v3_synthesizer = True
+ self.init_bigvgan()
+ if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
+ del vits_model.enc_q
+
+ if if_lora_v3 == False:
+ print(
+ f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
+ )
+ else:
+ print(
+ f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}"
+ )
+ lora_rank = dict_s2["lora_rank"]
+ lora_config = LoraConfig(
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ r=lora_rank,
+ lora_alpha=lora_rank,
+ init_lora_weights=True,
+ )
+ vits_model.cfm = get_peft_model(vits_model.cfm, lora_config)
+ print(
+ f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
+ )
+
+ vits_model.cfm = vits_model.cfm.merge_and_unload()
+
+ vits_model = vits_model.to(self.configs.device)
+ vits_model = vits_model.eval()
+
+ self.vits_model = vits_model
+ if self.configs.is_half and str(self.configs.device) != "cpu":
+ self.vits_model = self.vits_model.half()
+
+ def init_t2s_weights(self, weights_path: str):
+ print(f"Loading Text2Semantic weights from {weights_path}")
+ self.configs.t2s_weights_path = weights_path
+ self.configs.save_configs()
+ self.configs.hz = 50
+ dict_s1 = torch.load(weights_path, map_location=self.configs.device)
+ config = dict_s1["config"]
+ self.configs.max_sec = config["data"]["max_sec"]
+ t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+ t2s_model.load_state_dict(dict_s1["weight"])
+ t2s_model = t2s_model.to(self.configs.device)
+ t2s_model = t2s_model.eval()
+ self.t2s_model = t2s_model
+ if self.configs.is_half and str(self.configs.device) != "cpu":
+ self.t2s_model = self.t2s_model.half()
+
+ def init_bigvgan(self):
+ if self.bigvgan_model is not None:
+ return
+ self.bigvgan_model = BigVGAN.from_pretrained(
+ "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
+ use_cuda_kernel=False,
+ ) # if True, RuntimeError: Ninja is required to load C++ extensions
+ # remove weight norm in the model and set to eval mode
+ self.bigvgan_model.remove_weight_norm()
+ self.bigvgan_model = self.bigvgan_model.eval()
+ if self.configs.is_half == True:
+ self.bigvgan_model = self.bigvgan_model.half().to(self.configs.device)
+ else:
+ self.bigvgan_model = self.bigvgan_model.to(self.configs.device)
+
+ def init_sr_model(self):
+ if self.sr_model is not None:
+ return
+ try:
+ self.sr_model: AP_BWE = AP_BWE(self.configs.device, DictToAttrRecursive)
+ self.sr_model_not_exist = False
+ except FileNotFoundError:
+ print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好"))
+ self.sr_model_not_exist = True
+
+ def enable_half_precision(self, enable: bool = True, save: bool = True):
+ """
+        Enable or disable half precision for all loaded models.
+        Args:
+            enable: bool, whether to enable half precision.
+            save: bool, whether to persist the updated config via save_configs().
+
+ """
+ if str(self.configs.device) == "cpu" and enable:
+ print("Half precision is not supported on CPU.")
+ return
+
+ self.configs.is_half = enable
+ self.precision = torch.float16 if enable else torch.float32
+ if save:
+ self.configs.save_configs()
+ if enable:
+ if self.t2s_model is not None:
+ self.t2s_model = self.t2s_model.half()
+ if self.vits_model is not None:
+ self.vits_model = self.vits_model.half()
+ if self.bert_model is not None:
+ self.bert_model = self.bert_model.half()
+ if self.cnhuhbert_model is not None:
+ self.cnhuhbert_model = self.cnhuhbert_model.half()
+ if self.bigvgan_model is not None:
+ self.bigvgan_model = self.bigvgan_model.half()
+ else:
+ if self.t2s_model is not None:
+ self.t2s_model = self.t2s_model.float()
+ if self.vits_model is not None:
+ self.vits_model = self.vits_model.float()
+ if self.bert_model is not None:
+ self.bert_model = self.bert_model.float()
+ if self.cnhuhbert_model is not None:
+ self.cnhuhbert_model = self.cnhuhbert_model.float()
+ if self.bigvgan_model is not None:
+ self.bigvgan_model = self.bigvgan_model.float()
+
+ def set_device(self, device: torch.device, save: bool = True):
+ """
+        Move all loaded models to the given device.
+        Args:
+            device: torch.device, the device to use for all models.
+            save: bool, whether to persist the updated config via save_configs().
+ """
+ self.configs.device = device
+ if save:
+ self.configs.save_configs()
+ if self.t2s_model is not None:
+ self.t2s_model = self.t2s_model.to(device)
+ if self.vits_model is not None:
+ self.vits_model = self.vits_model.to(device)
+ if self.bert_model is not None:
+ self.bert_model = self.bert_model.to(device)
+ if self.cnhuhbert_model is not None:
+ self.cnhuhbert_model = self.cnhuhbert_model.to(device)
+ if self.bigvgan_model is not None:
+ self.bigvgan_model = self.bigvgan_model.to(device)
+ if self.sr_model is not None:
+ self.sr_model = self.sr_model.to(device)
+
+ def set_ref_audio(self, ref_audio_path: str):
+ """
+        Set the reference audio for the TTS model,
+        including the prompt_semantic and refer_spec.
+ Args:
+ ref_audio_path: str, the path of the reference audio.
+ """
+ self._set_prompt_semantic(ref_audio_path)
+ self._set_ref_spec(ref_audio_path)
+ self._set_ref_audio_path(ref_audio_path)
+
+ def _set_ref_audio_path(self, ref_audio_path):
+ self.prompt_cache["ref_audio_path"] = ref_audio_path
+
+ def _set_ref_spec(self, ref_audio_path):
+ spec = self._get_ref_spec(ref_audio_path)
+ if self.prompt_cache["refer_spec"] in [[], None]:
+ self.prompt_cache["refer_spec"] = [spec]
+ else:
+ self.prompt_cache["refer_spec"][0] = spec
+
+ def _get_ref_spec(self, ref_audio_path):
+ raw_audio, raw_sr = torchaudio.load(ref_audio_path)
+ raw_audio = raw_audio.to(self.configs.device).float()
+ self.prompt_cache["raw_audio"] = raw_audio
+ self.prompt_cache["raw_sr"] = raw_sr
+
+ audio = load_audio(ref_audio_path, int(self.configs.sampling_rate))
+ audio = torch.FloatTensor(audio)
+ maxx = audio.abs().max()
+ if maxx > 1:
+ audio /= min(2, maxx)
+ audio_norm = audio
+ audio_norm = audio_norm.unsqueeze(0)
+ spec = spectrogram_torch(
+ audio_norm,
+ self.configs.filter_length,
+ self.configs.sampling_rate,
+ self.configs.hop_length,
+ self.configs.win_length,
+ center=False,
+ )
+ spec = spec.to(self.configs.device)
+ if self.configs.is_half:
+ spec = spec.half()
+ return spec
+
+ def _set_prompt_semantic(self, ref_wav_path: str):
+ zero_wav = np.zeros(
+ int(self.configs.sampling_rate * 0.3),
+ dtype=np.float16 if self.configs.is_half else np.float32,
+ )
+ with torch.no_grad():
+ wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+ if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
+ raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
+ wav16k = torch.from_numpy(wav16k)
+ zero_wav_torch = torch.from_numpy(zero_wav)
+ wav16k = wav16k.to(self.configs.device)
+ zero_wav_torch = zero_wav_torch.to(self.configs.device)
+ if self.configs.is_half:
+ wav16k = wav16k.half()
+ zero_wav_torch = zero_wav_torch.half()
+
+ wav16k = torch.cat([wav16k, zero_wav_torch])
+ hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(
+ 1, 2
+ ) # .float()
+ codes = self.vits_model.extract_latent(hubert_feature)
+
+ prompt_semantic = codes[0, 0].to(self.configs.device)
+ self.prompt_cache["prompt_semantic"] = prompt_semantic
+
+ def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
+ seq = sequences[0]
+ ndim = seq.dim()
+ if axis < 0:
+ axis += ndim
+ dtype: torch.dtype = seq.dtype
+ pad_value = torch.tensor(pad_value, dtype=dtype)
+ seq_lengths = [seq.shape[axis] for seq in sequences]
+ if max_length is None:
+ max_length = max(seq_lengths)
+ else:
+ max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length
+
+ padded_sequences = []
+ for seq, length in zip(sequences, seq_lengths):
+ padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
+ padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
+ padded_sequences.append(padded_seq)
+ batch = torch.stack(padded_sequences)
+ return batch
+
+ def to_batch(
+ self,
+ data: list,
+ prompt_data: dict = None,
+ batch_size: int = 5,
+ threshold: float = 0.75,
+ split_bucket: bool = True,
+ device: torch.device = torch.device("cpu"),
+ precision: torch.dtype = torch.float32,
+ ):
+ _data: list = []
+ index_and_len_list = []
+ for idx, item in enumerate(data):
+ norm_text_len = len(item["norm_text"])
+ index_and_len_list.append([idx, norm_text_len])
+
+ batch_index_list = []
+ if split_bucket:
+ index_and_len_list.sort(key=lambda x: x[1])
+ index_and_len_list = np.array(index_and_len_list, dtype=np.int64)
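+
+            # Greedy bucketing over the length-sorted items: a candidate bucket of up to batch_size items is
+            # kept only when its median/mean length ratio reaches `threshold`; otherwise the bucket is shrunk
+            # from the long end until it qualifies (a single-item bucket is always accepted).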
+
+ batch_index_list_len = 0
+ pos = 0
+ while pos < index_and_len_list.shape[0]:
+ # batch_index_list.append(index_and_len_list[pos:min(pos+batch_size,len(index_and_len_list))])
+ pos_end = min(pos + batch_size, index_and_len_list.shape[0])
+ while pos < pos_end:
+ batch = index_and_len_list[pos:pos_end, 1].astype(np.float32)
+ score = batch[(pos_end - pos) // 2] / (batch.mean() + 1e-8)
+ if (score >= threshold) or (pos_end - pos == 1):
+ batch_index = index_and_len_list[pos:pos_end, 0].tolist()
+ batch_index_list_len += len(batch_index)
+ batch_index_list.append(batch_index)
+ pos = pos_end
+ break
+ pos_end = pos_end - 1
+
+ assert batch_index_list_len == len(data)
+
+ else:
+ for i in range(len(data)):
+ if i % batch_size == 0:
+ batch_index_list.append([])
+ batch_index_list[-1].append(i)
+
+ for batch_idx, index_list in enumerate(batch_index_list):
+ item_list = [data[idx] for idx in index_list]
+ phones_list = []
+ phones_len_list = []
+ # bert_features_list = []
+ all_phones_list = []
+ all_phones_len_list = []
+ all_bert_features_list = []
+ norm_text_batch = []
+ all_bert_max_len = 0
+ all_phones_max_len = 0
+ for item in item_list:
+ if prompt_data is not None:
+ all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1).to(
+ dtype=precision, device=device
+ )
+ all_phones = torch.LongTensor(prompt_data["phones"] + item["phones"]).to(device)
+ phones = torch.LongTensor(item["phones"]).to(device)
+ # norm_text = prompt_data["norm_text"]+item["norm_text"]
+ else:
+ all_bert_features = item["bert_features"].to(dtype=precision, device=device)
+ phones = torch.LongTensor(item["phones"]).to(device)
+ all_phones = phones
+ # norm_text = item["norm_text"]
+
+ all_bert_max_len = max(all_bert_max_len, all_bert_features.shape[-1])
+ all_phones_max_len = max(all_phones_max_len, all_phones.shape[-1])
+
+ phones_list.append(phones)
+ phones_len_list.append(phones.shape[-1])
+ all_phones_list.append(all_phones)
+ all_phones_len_list.append(all_phones.shape[-1])
+ all_bert_features_list.append(all_bert_features)
+ norm_text_batch.append(item["norm_text"])
+
+ phones_batch = phones_list
+ all_phones_batch = all_phones_list
+ all_bert_features_batch = all_bert_features_list
+
+ max_len = max(all_bert_max_len, all_phones_max_len)
+ # phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
+            #### Pad phones and bert_features directly. (The padding strategy affects what the T2S model generates, but it does not directly drive the repetition rate; the masking strategy is the main factor there.)
+ # all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
+ # all_bert_features_batch = all_bert_features_list
+ # all_bert_features_batch = torch.zeros((len(all_bert_features_list), 1024, max_len), dtype=precision, device=device)
+ # for idx, item in enumerate(all_bert_features_list):
+ # all_bert_features_batch[idx, :, : item.shape[-1]] = item
+
+            # #### Alternative: embed the phones and project the bert_features first, then pad both to the same length. (Same caveat: padding affects the T2S output, while the masking strategy drives the repetition rate.)
+ # all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
+ # all_phones_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) for item in all_phones_list]
+ # all_phones_batch = torch.stack(all_phones_list, dim=0)
+
+ # all_bert_features_list = [self.t2s_model.model.bert_proj(item.to(self.t2s_model.device).transpose(0, 1)) for item in all_bert_features_list]
+ # all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
+ # all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
+
+ batch = {
+ "phones": phones_batch,
+ "phones_len": torch.LongTensor(phones_len_list).to(device),
+ "all_phones": all_phones_batch,
+ "all_phones_len": torch.LongTensor(all_phones_len_list).to(device),
+ "all_bert_features": all_bert_features_batch,
+ "norm_text": norm_text_batch,
+ "max_len": max_len,
+ }
+ _data.append(batch)
+
+ return _data, batch_index_list
+
+ def recovery_order(self, data: list, batch_index_list: list) -> list:
+ """
+        Recover the original order of the audio fragments according to batch_index_list.
+
+        Args:
+            data (List[List[torch.Tensor]]): the out-of-order audio fragments.
+            batch_index_list (List[List[int]]): the batch index list.
+
+ Returns:
+ list (List[torch.Tensor]): the data in the original order.
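+
+        Example (illustrative):
+            With batch_index_list = [[2, 0], [1]] and data = [[a, b], [c]], the recovered order is [b, c, a].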
+ """
+ length = len(sum(batch_index_list, []))
+ _data = [None] * length
+ for i, index_list in enumerate(batch_index_list):
+ for j, index in enumerate(index_list):
+ _data[index] = data[i][j]
+ return _data
+
+ def stop(
+ self,
+ ):
+ """
+ Stop the inference process.
+ """
+ self.stop_flag = True
+
+ @torch.no_grad()
+ def run(self, inputs: dict):
+ """
+ Text to speech inference.
+
+ Args:
+ inputs (dict):
+ {
+ "text": "", # str.(required) text to be synthesized
+ "text_lang: "", # str.(required) language of the text to be synthesized
+ "ref_audio_path": "", # str.(required) reference audio path
+ "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
+ "prompt_text": "", # str.(optional) prompt text for the reference audio
+ "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
+ "top_k": 5, # int. top k sampling
+ "top_p": 1, # float. top p sampling
+ "temperature": 1, # float. temperature for sampling
+ "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details.
+ "batch_size": 1, # int. batch size for inference
+ "batch_threshold": 0.75, # float. threshold for batch splitting.
+ "split_bucket: True, # bool. whether to split the batch into multiple buckets.
+ "return_fragment": False, # bool. step by step return the audio fragment.
+ "speed_factor":1.0, # float. control the speed of the synthesized audio.
+ "fragment_interval":0.3, # float. to control the interval of the audio fragment.
+ "seed": -1, # int. random seed for reproducibility.
+ "parallel_infer": True, # bool. whether to use parallel inference.
+ "repetition_penalty": 1.35 # float. repetition penalty for T2S model.
+ "sample_steps": 32, # int. number of sampling steps for VITS model V3.
+ "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
+ }
+ returns:
+ Tuple[int, np.ndarray]: sampling rate and audio data.
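+
+        Example (a hedged usage sketch; the reference audio path and texts are placeholders):
+            tts = TTS(TTS_Config())
+            for sr, audio in tts.run({
+                "text": "Hello world.",
+                "text_lang": "en",
+                "ref_audio_path": "path/to/ref.wav",
+                "prompt_text": "transcript of the reference audio",
+                "prompt_lang": "en",
+            }):
+                ...  # audio is an int16 np.ndarray sampled at sr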
+ """
+ ########## variables initialization ###########
+ self.stop_flag: bool = False
+ text: str = inputs.get("text", "")
+ text_lang: str = inputs.get("text_lang", "")
+ ref_audio_path: str = inputs.get("ref_audio_path", "")
+ aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
+ prompt_text: str = inputs.get("prompt_text", "")
+ prompt_lang: str = inputs.get("prompt_lang", "")
+ top_k: int = inputs.get("top_k", 5)
+ top_p: float = inputs.get("top_p", 1)
+ temperature: float = inputs.get("temperature", 1)
+ text_split_method: str = inputs.get("text_split_method", "cut0")
+ batch_size = inputs.get("batch_size", 1)
+ batch_threshold = inputs.get("batch_threshold", 0.75)
+ speed_factor = inputs.get("speed_factor", 1.0)
+ split_bucket = inputs.get("split_bucket", True)
+ return_fragment = inputs.get("return_fragment", False)
+ fragment_interval = inputs.get("fragment_interval", 0.3)
+ seed = inputs.get("seed", -1)
+ seed = -1 if seed in ["", None] else seed
+ actual_seed = set_seed(seed)
+ parallel_infer = inputs.get("parallel_infer", True)
+ repetition_penalty = inputs.get("repetition_penalty", 1.35)
+ sample_steps = inputs.get("sample_steps", 32)
+ super_sampling = inputs.get("super_sampling", False)
+
+ if parallel_infer:
+ print(i18n("并行推理模式已开启"))
+ self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer
+ else:
+ print(i18n("并行推理模式已关闭"))
+ self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_naive_batched
+
+ if return_fragment:
+ print(i18n("分段返回模式已开启"))
+ if split_bucket:
+ split_bucket = False
+ print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
+
+ if split_bucket and speed_factor == 1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
+ print(i18n("分桶处理模式已开启"))
+ elif speed_factor != 1.0:
+ print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
+ split_bucket = False
+ elif self.configs.is_v3_synthesizer and parallel_infer:
+ print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理"))
+ split_bucket = False
+ else:
+ print(i18n("分桶处理模式已关闭"))
+
+ if fragment_interval < 0.01:
+ fragment_interval = 0.01
+ print(i18n("分段间隔过小,已自动设置为0.01"))
+
+ no_prompt_text = False
+ if prompt_text in [None, ""]:
+ no_prompt_text = True
+
+ assert text_lang in self.configs.languages
+ if not no_prompt_text:
+ assert prompt_lang in self.configs.languages
+
+ if no_prompt_text and self.configs.is_v3_synthesizer:
+ raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3")
+
+ if ref_audio_path in [None, ""] and (
+ (self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])
+ ):
+ raise ValueError(
+ "ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()"
+ )
+
+ ###### setting reference audio and prompt text preprocessing ########
+ t0 = time.perf_counter()
+ if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
+ if not os.path.exists(ref_audio_path):
+ raise ValueError(f"{ref_audio_path} not exists")
+ self.set_ref_audio(ref_audio_path)
+
+ aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else []
+ paths = set(aux_ref_audio_paths) & set(self.prompt_cache["aux_ref_audio_paths"])
+ if not (len(list(paths)) == len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])):
+ self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths
+ self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]]
+ for path in aux_ref_audio_paths:
+ if path in [None, ""]:
+ continue
+ if not os.path.exists(path):
+ print(i18n("音频文件不存在,跳过:"), path)
+ continue
+ self.prompt_cache["refer_spec"].append(self._get_ref_spec(path))
+
+ if not no_prompt_text:
+ prompt_text = prompt_text.strip("\n")
+ if prompt_text[-1] not in splits:
+ prompt_text += "。" if prompt_lang != "en" else "."
+ print(i18n("实际输入的参考文本:"), prompt_text)
+ if self.prompt_cache["prompt_text"] != prompt_text:
+ phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(
+ prompt_text, prompt_lang, self.configs.version
+ )
+ self.prompt_cache["prompt_text"] = prompt_text
+ self.prompt_cache["prompt_lang"] = prompt_lang
+ self.prompt_cache["phones"] = phones
+ self.prompt_cache["bert_features"] = bert_features
+ self.prompt_cache["norm_text"] = norm_text
+
+ ###### text preprocessing ########
+ t1 = time.perf_counter()
+ data: list = None
+ if not return_fragment:
+ data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
+ if len(data) == 0:
+ yield 16000, np.zeros(int(16000), dtype=np.int16)
+ return
+
+ batch_index_list: list = None
+ data, batch_index_list = self.to_batch(
+ data,
+ prompt_data=self.prompt_cache if not no_prompt_text else None,
+ batch_size=batch_size,
+ threshold=batch_threshold,
+ split_bucket=split_bucket,
+ device=self.configs.device,
+ precision=self.precision,
+ )
+ else:
+ print(f"############ {i18n('切分文本')} ############")
+ texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
+ data = []
+ for i in range(len(texts)):
+ if i % batch_size == 0:
+ data.append([])
+ data[-1].append(texts[i])
+
+ def make_batch(batch_texts):
+ batch_data = []
+ print(f"############ {i18n('提取文本Bert特征')} ############")
+ for text in tqdm(batch_texts):
+ phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(
+ text, text_lang, self.configs.version
+ )
+ if phones is None:
+ continue
+ res = {
+ "phones": phones,
+ "bert_features": bert_features,
+ "norm_text": norm_text,
+ }
+ batch_data.append(res)
+ if len(batch_data) == 0:
+ return None
+ batch, _ = self.to_batch(
+ batch_data,
+ prompt_data=self.prompt_cache if not no_prompt_text else None,
+ batch_size=batch_size,
+ threshold=batch_threshold,
+ split_bucket=False,
+ device=self.configs.device,
+ precision=self.precision,
+ )
+ return batch[0]
+
+ t2 = time.perf_counter()
+ try:
+ print("############ 推理 ############")
+ ###### inference ######
+ t_34 = 0.0
+ t_45 = 0.0
+ audio = []
+ output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
+ for item in data:
+ t3 = time.perf_counter()
+ if return_fragment:
+ item = make_batch(item)
+ if item is None:
+ continue
+
+ batch_phones: List[torch.LongTensor] = item["phones"]
+ # batch_phones:torch.LongTensor = item["phones"]
+ batch_phones_len: torch.LongTensor = item["phones_len"]
+ all_phoneme_ids: torch.LongTensor = item["all_phones"]
+ all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
+ all_bert_features: torch.LongTensor = item["all_bert_features"]
+ norm_text: str = item["norm_text"]
+ max_len = item["max_len"]
+
+ print(i18n("前端处理后的文本(每句):"), norm_text)
+ if no_prompt_text:
+ prompt = None
+ else:
+ prompt = (
+ self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
+ )
+
+ print(f"############ {i18n('预测语义Token')} ############")
+ pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
+ all_phoneme_ids,
+ all_phoneme_lens,
+ prompt,
+ all_bert_features,
+ # prompt_phone_len=ph_offset,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ early_stop_num=self.configs.hz * self.configs.max_sec,
+ max_len=max_len,
+ repetition_penalty=repetition_penalty,
+ )
+ t4 = time.perf_counter()
+ t_34 += t4 - t3
+
+ refer_audio_spec: torch.Tensor = [
+ item.to(dtype=self.precision, device=self.configs.device)
+ for item in self.prompt_cache["refer_spec"]
+ ]
+
+ batch_audio_fragment = []
+
+                # ## VITS parallel inference, method 1
+ # pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
+ # pred_semantic_len = torch.LongTensor([item.shape[0] for item in pred_semantic_list]).to(self.configs.device)
+ # pred_semantic = self.batch_sequences(pred_semantic_list, axis=0, pad_value=0).unsqueeze(0)
+ # max_len = 0
+ # for i in range(0, len(batch_phones)):
+ # max_len = max(max_len, batch_phones[i].shape[-1])
+ # batch_phones = self.batch_sequences(batch_phones, axis=0, pad_value=0, max_length=max_len)
+ # batch_phones = batch_phones.to(self.configs.device)
+ # batch_audio_fragment = (self.vits_model.batched_decode(
+ # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec
+ # ))
+ print(f"############ {i18n('合成音频')} ############")
+ if not self.configs.is_v3_synthesizer:
+ if speed_factor == 1.0:
+ print(f"{i18n('并行合成中')}...")
+                        # ## VITS parallel inference, method 2
+ pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
+ upsample_rate = math.prod(self.vits_model.upsample_rates)
+ audio_frag_idx = [
+ pred_semantic_list[i].shape[0] * 2 * upsample_rate
+ for i in range(0, len(pred_semantic_list))
+ ]
+ audio_frag_end_idx = [sum(audio_frag_idx[: i + 1]) for i in range(0, len(audio_frag_idx))]
+ all_pred_semantic = (
+ torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
+ )
+ _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
+ _batch_audio_fragment = self.vits_model.decode(
+ all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
+ ).detach()[0, 0, :]
+ audio_frag_end_idx.insert(0, 0)
+ batch_audio_fragment = [
+ _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
+ for i in range(1, len(audio_frag_end_idx))
+ ]
+ else:
+                        # ## VITS sequential inference
+ for i, idx in enumerate(tqdm(idx_list)):
+ phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
+ _pred_semantic = (
+ pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
+                            )  # .unsqueeze(0)  # one extra unsqueeze is needed here
+ audio_fragment = self.vits_model.decode(
+ _pred_semantic, phones, refer_audio_spec, speed=speed_factor
+ ).detach()[0, 0, :]
+                            batch_audio_fragment.append(audio_fragment)  # try reconstructing without the prompt part
+ else:
+ if parallel_infer:
+ print(f"{i18n('并行合成中')}...")
+ audio_fragments = self.v3_synthesis_batched_infer(
+ idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps
+ )
+ batch_audio_fragment.extend(audio_fragments)
+ else:
+ for i, idx in enumerate(tqdm(idx_list)):
+ phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
+ _pred_semantic = (
+ pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
+                            )  # .unsqueeze(0)  # one extra unsqueeze is needed here
+ audio_fragment = self.v3_synthesis(
+ _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
+ )
+ batch_audio_fragment.append(audio_fragment)
+
+ t5 = time.perf_counter()
+ t_45 += t5 - t4
+ if return_fragment:
+ print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
+ yield self.audio_postprocess(
+ [batch_audio_fragment],
+ output_sr,
+ None,
+ speed_factor,
+ False,
+ fragment_interval,
+ super_sampling if self.configs.is_v3_synthesizer else False,
+ )
+ else:
+ audio.append(batch_audio_fragment)
+
+ if self.stop_flag:
+ yield 16000, np.zeros(int(16000), dtype=np.int16)
+ return
+
+ if not return_fragment:
+ print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
+ if len(audio) == 0:
+ yield 16000, np.zeros(int(16000), dtype=np.int16)
+ return
+ yield self.audio_postprocess(
+ audio,
+ output_sr,
+ batch_index_list,
+ speed_factor,
+ split_bucket,
+ fragment_interval,
+ super_sampling if self.configs.is_v3_synthesizer else False,
+ )
+
+ except Exception as e:
+ traceback.print_exc()
+            # An empty audio chunk must be yielded here, otherwise GPU memory is not released.
+ yield 16000, np.zeros(int(16000), dtype=np.int16)
+            # Reset the models, otherwise GPU memory is not fully released.
+ del self.t2s_model
+ del self.vits_model
+ self.t2s_model = None
+ self.vits_model = None
+ self.init_t2s_weights(self.configs.t2s_weights_path)
+ self.init_vits_weights(self.configs.vits_weights_path)
+ raise e
+ finally:
+ self.empty_cache()
+
+ def empty_cache(self):
+ try:
+            gc.collect()  # trigger garbage collection so memory does not keep growing
+ if "cuda" in str(self.configs.device):
+ torch.cuda.empty_cache()
+ elif str(self.configs.device) == "mps":
+ torch.mps.empty_cache()
+ except:
+ pass
+
+ def audio_postprocess(
+ self,
+ audio: List[torch.Tensor],
+ sr: int,
+ batch_index_list: list = None,
+ speed_factor: float = 1.0,
+ split_bucket: bool = True,
+ fragment_interval: float = 0.3,
+ super_sampling: bool = False,
+ ) -> Tuple[int, np.ndarray]:
+ zero_wav = torch.zeros(
+ int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
+ )
+
+ for i, batch in enumerate(audio):
+ for j, audio_fragment in enumerate(batch):
+                max_audio = torch.abs(audio_fragment).max()  # simple guard against 16-bit clipping
+ if max_audio > 1:
+ audio_fragment /= max_audio
+ audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0)
+ audio[i][j] = audio_fragment
+
+ if split_bucket:
+ audio = self.recovery_order(audio, batch_index_list)
+ else:
+ # audio = [item for batch in audio for item in batch]
+ audio = sum(audio, [])
+
+ audio = torch.cat(audio, dim=0)
+
+ if super_sampling:
+ print(f"############ {i18n('音频超采样')} ############")
+ t1 = time.perf_counter()
+ self.init_sr_model()
+ if not self.sr_model_not_exist:
+ audio, sr = self.sr_model(audio.unsqueeze(0), sr)
+ max_audio = np.abs(audio).max()
+ if max_audio > 1:
+ audio /= max_audio
+ t2 = time.perf_counter()
+ print(f"超采样用时:{t2 - t1:.3f}s")
+ else:
+ audio = audio.cpu().numpy()
+
+ audio = (audio * 32768).astype(np.int16)
+
+ # try:
+ # if speed_factor != 1.0:
+ # audio = speed_change(audio, speed=speed_factor, sr=int(sr))
+ # except Exception as e:
+ # print(f"Failed to change speed of audio: \n{e}")
+
+ return sr, audio
+
+ def v3_synthesis(
+ self, semantic_tokens: torch.Tensor, phones: torch.Tensor, speed: float = 1.0, sample_steps: int = 32
+ ):
+ prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
+ prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
+ refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
+
+ fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
+ ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
+ ref_sr = self.prompt_cache["raw_sr"]
+ ref_audio = ref_audio.to(self.configs.device).float()
+ if ref_audio.shape[0] == 2:
+ ref_audio = ref_audio.mean(0).unsqueeze(0)
+ if ref_sr != 24000:
+ ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+
+ mel2 = mel_fn(ref_audio)
+ mel2 = norm_spec(mel2)
+ T_min = min(mel2.shape[2], fea_ref.shape[2])
+ mel2 = mel2[:, :, :T_min]
+ fea_ref = fea_ref[:, :, :T_min]
+ if T_min > 468:
+ mel2 = mel2[:, :, -468:]
+ fea_ref = fea_ref[:, :, -468:]
+ T_min = 468
+ chunk_len = 934 - T_min
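+        # Each CFM call sees a window of at most 934 frames: the trailing T_min reference frames plus up to
+        # chunk_len newly generated frames; after every chunk, mel2 and fea_ref are rolled forward to the
+        # chunk's tail so the next chunk stays continuous with what was just synthesized.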
+
+ mel2 = mel2.to(self.precision)
+ fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
+
+ cfm_resss = []
+ idx = 0
+ while 1:
+ fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len]
+ if fea_todo_chunk.shape[-1] == 0:
+ break
+ idx += chunk_len
+ fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
+
+ cfm_res = self.vits_model.cfm.inference(
+ fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
+ )
+ cfm_res = cfm_res[:, :, mel2.shape[2] :]
+
+ mel2 = cfm_res[:, :, -T_min:]
+ fea_ref = fea_todo_chunk[:, :, -T_min:]
+
+ cfm_resss.append(cfm_res)
+ cfm_res = torch.cat(cfm_resss, 2)
+ cfm_res = denorm_spec(cfm_res)
+
+ with torch.inference_mode():
+ wav_gen = self.bigvgan_model(cfm_res)
+ audio = wav_gen[0][0] # .cpu().detach().numpy()
+
+ return audio
+
+ def v3_synthesis_batched_infer(
+ self,
+ idx_list: List[int],
+ semantic_tokens_list: List[torch.Tensor],
+ batch_phones: List[torch.Tensor],
+ speed: float = 1.0,
+ sample_steps: int = 32,
+ ) -> List[torch.Tensor]:
+ prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
+ prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
+ refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
+
+ fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
+ ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
+ ref_sr = self.prompt_cache["raw_sr"]
+ ref_audio = ref_audio.to(self.configs.device).float()
+ if ref_audio.shape[0] == 2:
+ ref_audio = ref_audio.mean(0).unsqueeze(0)
+ if ref_sr != 24000:
+ ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+
+ mel2 = mel_fn(ref_audio)
+ mel2 = norm_spec(mel2)
+ T_min = min(mel2.shape[2], fea_ref.shape[2])
+ mel2 = mel2[:, :, :T_min]
+ fea_ref = fea_ref[:, :, :T_min]
+ if T_min > 468:
+ mel2 = mel2[:, :, -468:]
+ fea_ref = fea_ref[:, :, -468:]
+ T_min = 468
+ chunk_len = 934 - T_min
+
+ mel2 = mel2.to(self.precision)
+
+ # #### batched inference
+ overlapped_len = 12
+ feat_chunks = []
+ feat_lens = []
+ feat_list = []
+
+ for i, idx in enumerate(idx_list):
+ phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
+ semantic_tokens = (
+ semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0)
+            )  # .unsqueeze(0)  # one extra unsqueeze is needed here
+ feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
+ feat_list.append(feat)
+ feat_lens.append(feat.shape[2])
+
+ feats = torch.cat(feat_list, 2)
+ feats_padded = F.pad(feats, (overlapped_len, 0), "constant", 0)
+ pos = 0
+ padding_len = 0
+ while True:
+ if pos == 0:
+ chunk = feats_padded[:, :, pos : pos + chunk_len]
+ else:
+ pos = pos - overlapped_len
+ chunk = feats_padded[:, :, pos : pos + chunk_len]
+ pos += chunk_len
+ if chunk.shape[-1] == 0:
+ break
+
+ # padding for the last chunk
+ padding_len = chunk_len - chunk.shape[2]
+ if padding_len != 0:
+ chunk = F.pad(chunk, (0, padding_len), "constant", 0)
+ feat_chunks.append(chunk)
+
+ feat_chunks = torch.cat(feat_chunks, 0)
+ bs = feat_chunks.shape[0]
+ fea_ref = fea_ref.repeat(bs, 1, 1)
+ fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
+ pred_spec = self.vits_model.cfm.inference(
+ fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
+ )
+ pred_spec = pred_spec[:, :, -chunk_len:]
+ dd = pred_spec.shape[1]
+ pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
+ # pred_spec = pred_spec[..., :-padding_len]
+
+ pred_spec = denorm_spec(pred_spec)
+
+ with torch.no_grad():
+ wav_gen = self.bigvgan_model(pred_spec)
+ audio = wav_gen[0][0] # .cpu().detach().numpy()
+
+ audio_fragments = []
+ upsample_rate = 256
+ pos = 0
+
+ while pos < audio.shape[-1]:
+ audio_fragment = audio[pos : pos + chunk_len * upsample_rate]
+ audio_fragments.append(audio_fragment)
+ pos += chunk_len * upsample_rate
+
+ audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
+ audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
+
+ audio_fragments = []
+ for feat_len in feat_lens:
+ audio_fragment = audio[: feat_len * upsample_rate]
+ audio_fragments.append(audio_fragment)
+ audio = audio[feat_len * upsample_rate :]
+
+ return audio_fragments
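+
+    # Note (illustrative): the fragments produced by v3_synthesis_batched_infer share overlapped_len * upsample_rate
+    # samples at their boundaries; sola_algorithm aligns each adjacent pair on the cross-correlation peak of that
+    # overlap and cross-fades it with a Hann window before concatenating the fragments.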
+
+ def sola_algorithm(
+ self,
+ audio_fragments: List[torch.Tensor],
+ overlap_len: int,
+ ):
+ for i in range(len(audio_fragments) - 1):
+ f1 = audio_fragments[i]
+ f2 = audio_fragments[i + 1]
+ w1 = f1[-overlap_len:]
+ w2 = f2[:overlap_len]
+ assert w1.shape == w2.shape
+ corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1]
+ idx = corr.argmax()
+ f1_ = f1[: -(overlap_len - idx)]
+ audio_fragments[i] = f1_
+
+ f2_ = f2[idx:]
+ window = torch.hann_window((overlap_len - idx) * 2, device=f1.device, dtype=f1.dtype)
+ f2_[: (overlap_len - idx)] = (
+ window[: (overlap_len - idx)] * f2_[: (overlap_len - idx)]
+ + window[(overlap_len - idx) :] * f1[-(overlap_len - idx) :]
+ )
+ audio_fragments[i + 1] = f2_
+
+ return torch.cat(audio_fragments, 0)
diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..426929f8bdfbde172d584c51810751e3d3635fac
--- /dev/null
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -0,0 +1,237 @@
+import os
+import sys
+import threading
+
+from tqdm import tqdm
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+import re
+import torch
+from text.LangSegmenter import LangSegmenter
+from text import chinese
+from typing import Dict, List, Tuple
+from text.cleaner import clean_text
+from text import cleaned_text_to_sequence
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
+
+from tools.i18n.i18n import I18nAuto, scan_language_list
+
+language = os.environ.get("language", "Auto")
+language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
+i18n = I18nAuto(language=language)
+punctuation = set(["!", "?", "…", ",", ".", "-"])
+
+
+def get_first(text: str) -> str:
+ pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
+ text = re.split(pattern, text)[0].strip()
+ return text
+
+
+def merge_short_text_in_array(texts: list, threshold: int) -> list:
+ if (len(texts)) < 2:
+ return texts
+ result = []
+ text = ""
+ for ele in texts:
+ text += ele
+ if len(text) >= threshold:
+ result.append(text)
+ text = ""
+ if len(text) > 0:
+ if len(result) == 0:
+ result.append(text)
+ else:
+ result[len(result) - 1] += text
+ return result
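+
+# Illustrative example (assumed inputs): merge_short_text_in_array(["Hi.", "Ok.", "Fine, thanks."], 5)
+# returns ["Hi.Ok.", "Fine, thanks."]: fragments are concatenated until each merged piece reaches the threshold.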
+
+
+class TextPreprocessor:
+ def __init__(self, bert_model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, device: torch.device):
+ self.bert_model = bert_model
+ self.tokenizer = tokenizer
+ self.device = device
+ self.bert_lock = threading.RLock()
+
+ def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]:
+ print(f"############ {i18n('切分文本')} ############")
+ text = self.replace_consecutive_punctuation(text)
+ texts = self.pre_seg_text(text, lang, text_split_method)
+ result = []
+ print(f"############ {i18n('提取文本Bert特征')} ############")
+ for text in tqdm(texts):
+ phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(text, lang, version)
+ if phones is None or norm_text == "":
+ continue
+ res = {
+ "phones": phones,
+ "bert_features": bert_features,
+ "norm_text": norm_text,
+ }
+ result.append(res)
+ return result
+
+ def pre_seg_text(self, text: str, lang: str, text_split_method: str):
+ text = text.strip("\n")
+ if len(text) == 0:
+ return []
+ if text[0] not in splits and len(get_first(text)) < 4:
+ text = "。" + text if lang != "en" else "." + text
+ print(i18n("实际输入的目标文本:"))
+ print(text)
+
+ seg_method = get_seg_method(text_split_method)
+ text = seg_method(text)
+
+ while "\n\n" in text:
+ text = text.replace("\n\n", "\n")
+
+ _texts = text.split("\n")
+ _texts = self.filter_text(_texts)
+ _texts = merge_short_text_in_array(_texts, 5)
+ texts = []
+
+ for text in _texts:
+            # skip empty lines in the target text to avoid errors downstream
+ if len(text.strip()) == 0:
+ continue
+            if not re.sub(r"\W+", "", text):
+                # skip segments that contain only punctuation/symbols
+ continue
+ if text[-1] not in splits:
+ text += "。" if lang != "en" else "."
+
+            # split overly long sentences so BERT does not error out
+ if len(text) > 510:
+ texts.extend(split_big_text(text))
+ else:
+ texts.append(text)
+
+ print(i18n("实际输入的目标文本(切句后):"))
+ print(texts)
+ return texts
+
+ def segment_and_extract_feature_for_text(
+ self, text: str, language: str, version: str = "v1"
+ ) -> Tuple[list, torch.Tensor, str]:
+ return self.get_phones_and_bert(text, language, version)
+
+ def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
+ with self.bert_lock:
+ if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
+ # language = language.replace("all_","")
+ formattext = text
+ while " " in formattext:
+ formattext = formattext.replace(" ", " ")
+ if language == "all_zh":
+ if re.search(r"[A-Za-z]", formattext):
+ formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
+ formattext = chinese.mix_text_normalize(formattext)
+ return self.get_phones_and_bert(formattext, "zh", version)
+ else:
+ phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+ bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
+ elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
+ formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
+ formattext = chinese.mix_text_normalize(formattext)
+ return self.get_phones_and_bert(formattext, "yue", version)
+ else:
+ phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+ bert = torch.zeros(
+ (1024, len(phones)),
+ dtype=torch.float32,
+ ).to(self.device)
+ elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
+ textlist = []
+ langlist = []
+ if language == "auto":
+ for tmp in LangSegmenter.getTexts(text):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "auto_yue":
+ for tmp in LangSegmenter.getTexts(text):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ else:
+ for tmp in LangSegmenter.getTexts(text):
+ if tmp["lang"] == "en":
+ langlist.append(tmp["lang"])
+ else:
+                            # CJK characters cannot be reliably told apart, so follow the user-specified language
+ langlist.append(language)
+ textlist.append(tmp["text"])
+ # print(textlist)
+ # print(langlist)
+ phones_list = []
+ bert_list = []
+ norm_text_list = []
+ for i in range(len(textlist)):
+ lang = langlist[i]
+ phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
+ bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
+ phones_list.append(phones)
+ norm_text_list.append(norm_text)
+ bert_list.append(bert)
+ bert = torch.cat(bert_list, dim=1)
+ phones = sum(phones_list, [])
+ norm_text = "".join(norm_text_list)
+
+ if not final and len(phones) < 6:
+ return self.get_phones_and_bert("." + text, language, version, final=True)
+
+ return phones, bert, norm_text
+
+ def get_bert_feature(self, text: str, word2ph: list) -> torch.Tensor:
+ with torch.no_grad():
+ inputs = self.tokenizer(text, return_tensors="pt")
+ for i in inputs:
+ inputs[i] = inputs[i].to(self.device)
+ res = self.bert_model(**inputs, output_hidden_states=True)
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+ assert len(word2ph) == len(text)
+ phone_level_feature = []
+ for i in range(len(word2ph)):
+ repeat_feature = res[i].repeat(word2ph[i], 1)
+ phone_level_feature.append(repeat_feature)
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
+ return phone_level_feature.T
+
+ def clean_text_inf(self, text: str, language: str, version: str = "v2"):
+ language = language.replace("all_", "")
+ phones, word2ph, norm_text = clean_text(text, language, version)
+ phones = cleaned_text_to_sequence(phones, version)
+ return phones, word2ph, norm_text
+
+ def get_bert_inf(self, phones: list, word2ph: list, norm_text: str, language: str):
+ language = language.replace("all_", "")
+ if language == "zh":
+ feature = self.get_bert_feature(norm_text, word2ph).to(self.device)
+ else:
+ feature = torch.zeros(
+ (1024, len(phones)),
+ dtype=torch.float32,
+ ).to(self.device)
+
+ return feature
+
+ def filter_text(self, texts):
+ _text = []
+ if all(text in [None, " ", "\n", ""] for text in texts):
+ raise ValueError(i18n("请输入有效文本"))
+ for text in texts:
+ if text in [None, " ", ""]:
+ pass
+ else:
+ _text.append(text)
+ return _text
+
+ def replace_consecutive_punctuation(self, text):
+ punctuations = "".join(re.escape(p) for p in punctuation)
+ pattern = f"([{punctuations}])([{punctuations}])+"
+ result = re.sub(pattern, r"\1", text)
+ return result
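
A minimal usage sketch of the preprocessor above (reviewer note, not part of the diff). It assumes the `TextPreprocessor` constructor takes the BERT model, its tokenizer, and the target device in that order, which is how `TTS.py` in this PR appears to wire it up; only the shape of the returned records is relied on here.

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer
from TTS_infer_pack.TextPreprocessor import TextPreprocessor

# assumed constructor order: (bert_model, tokenizer, device)
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
preprocessor = TextPreprocessor(
    AutoModelForMaskedLM.from_pretrained(bert_path),
    AutoTokenizer.from_pretrained(bert_path),
    "cpu",
)

segments = preprocessor.preprocess(
    "GPT-SoVITS是一个少样本语音合成项目。", lang="zh", text_split_method="cut2"
)
for seg in segments:
    # each record: phoneme id list, (1024, n_phones) BERT feature tensor, normalized text
    print(len(seg["phones"]), tuple(seg["bert_features"].shape), seg["norm_text"])
```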
diff --git a/GPT_SoVITS/TTS_infer_pack/__init__.py b/GPT_SoVITS/TTS_infer_pack/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8579a63215aba6c2dc5674c4e8711256c7e904a7
--- /dev/null
+++ b/GPT_SoVITS/TTS_infer_pack/__init__.py
@@ -0,0 +1 @@
+from . import TTS, text_segmentation_method
diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda70a49834ea43c2a3a55154705e111b24fa196
--- /dev/null
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@@ -0,0 +1,189 @@
+import re
+from typing import Callable
+
+punctuation = set(["!", "?", "…", ",", ".", "-", " "])
+METHODS = dict()
+
+
+def get_method(name: str) -> Callable:
+ method = METHODS.get(name, None)
+ if method is None:
+ raise ValueError(f"Method {name} not found")
+ return method
+
+
+def get_method_names() -> list:
+ return list(METHODS.keys())
+
+
+def register_method(name):
+ def decorator(func):
+ METHODS[name] = func
+ return func
+
+ return decorator
+
+
+splits = {
+ ",",
+ "。",
+ "?",
+ "!",
+ ",",
+ ".",
+ "?",
+ "!",
+ "~",
+ ":",
+ ":",
+ "—",
+ "…",
+}
+
+
+def split_big_text(text, max_len=510):
+    # Full-width and half-width punctuation marks used as split boundaries
+ punctuation = "".join(splits)
+
+    # Split the text, keeping the punctuation delimiters
+ segments = re.split("([" + punctuation + "])", text)
+
+    # Initialize the result list and the current segment
+ result = []
+ current_segment = ""
+
+ for segment in segments:
+        # If adding the new piece would exceed max_len, flush the current segment and start a new one
+ if len(current_segment + segment) > max_len:
+ result.append(current_segment)
+ current_segment = segment
+ else:
+ current_segment += segment
+
+    # Append the final segment
+ if current_segment:
+ result.append(current_segment)
+
+ return result
+
+
+def split(todo_text):
+ todo_text = todo_text.replace("……", "。").replace("——", ",")
+ if todo_text[-1] not in splits:
+ todo_text += "。"
+ i_split_head = i_split_tail = 0
+ len_text = len(todo_text)
+ todo_texts = []
+ while 1:
+ if i_split_head >= len_text:
+            break  # The text is guaranteed to end with punctuation, so the last segment was already appended
+ if todo_text[i_split_head] in splits:
+ i_split_head += 1
+ todo_texts.append(todo_text[i_split_tail:i_split_head])
+ i_split_tail = i_split_head
+ else:
+ i_split_head += 1
+ return todo_texts
+
+
+# No splitting
+@register_method("cut0")
+def cut0(inp):
+ if not set(inp).issubset(punctuation):
+ return inp
+ else:
+ return "/n"
+
+
+# Split every four sentences
+@register_method("cut1")
+def cut1(inp):
+ inp = inp.strip("\n")
+ inps = split(inp)
+ split_idx = list(range(0, len(inps), 4))
+ split_idx[-1] = None
+ if len(split_idx) > 1:
+ opts = []
+ for idx in range(len(split_idx) - 1):
+ opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]]))
+ else:
+ opts = [inp]
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
+ return "\n".join(opts)
+
+
+# Split roughly every 50 characters
+@register_method("cut2")
+def cut2(inp):
+ inp = inp.strip("\n")
+ inps = split(inp)
+ if len(inps) < 2:
+ return inp
+ opts = []
+ summ = 0
+ tmp_str = ""
+ for i in range(len(inps)):
+ summ += len(inps[i])
+ tmp_str += inps[i]
+ if summ > 50:
+ summ = 0
+ opts.append(tmp_str)
+ tmp_str = ""
+ if tmp_str != "":
+ opts.append(tmp_str)
+ # print(opts)
+    if len(opts) > 1 and len(opts[-1]) < 50:  # if the last chunk is too short, merge it into the previous one
+ opts[-2] = opts[-2] + opts[-1]
+ opts = opts[:-1]
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
+ return "\n".join(opts)
+
+
+# Split on the Chinese full stop "。"
+@register_method("cut3")
+def cut3(inp):
+ inp = inp.strip("\n")
+ opts = ["%s" % item for item in inp.strip("。").split("。")]
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
+ return "\n".join(opts)
+
+
+# Split on the English period "."
+@register_method("cut4")
+def cut4(inp):
+ inp = inp.strip("\n")
+ opts = re.split(r"(? 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
+ items.append(char)
+ else:
+ items.append(char)
+ mergeitems.append("".join(items))
+ items = []
+ else:
+ items.append(char)
+
+ if items:
+ mergeitems.append("".join(items))
+
+ opt = [item for item in mergeitems if not set(item).issubset(punds)]
+ return "\n".join(opt)
+
+
+if __name__ == "__main__":
+ method = get_method("cut5")
+ print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。"))
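
Reviewer note (not part of the diff): the `METHODS` registry above makes it straightforward to plug in project-specific cut rules. A small sketch, where `cut_lines` is a hypothetical rule added purely for illustration:

```python
from TTS_infer_pack.text_segmentation_method import get_method, register_method

@register_method("cut_lines")  # hypothetical rule: split only on existing newlines
def cut_lines(inp: str) -> str:
    return "\n".join(line for line in inp.split("\n") if line.strip())

seg_fn = get_method("cut_lines")
print(seg_fn("第一行\n\n第二行"))  # prints the two non-empty lines, one per line
```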
diff --git a/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin b/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a73dedefbc10383bc70e289fe0897d2157c6f66b
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c
+size 188811417
diff --git a/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin b/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9d67762821a81a27af2c60d8094ad86cad6510c3
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8
+size 651225145
diff --git a/GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin b/GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f8707035ea3cc86ac248a4e31fa6368cd845476a
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
+size 131266198
diff --git a/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..3b8a968297b05d69f441e5981111d48426fdcb9b
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732f94e63b148066e24c7f9d2637f3374083e637635f07fbdb695dee20ddbe1f
+size 155315150
diff --git a/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5eecfd7267624bfb64cd8e3be88f1e1d2d99e19b
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ae7fe8dd8c8f2e718de359e00edac88b0c71ab2fd10b07ad4cc45070eb8a836
+size 93534164
diff --git a/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
new file mode 100644
index 0000000000000000000000000000000000000000..51d3419ff8403b6d73334b4dba7aabdd79ed38fe
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:924fdccaa3c574bf139c25c9759aa1ed3b3f99e19a7c529ee996c2bc17663695
+size 106035259
diff --git a/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth b/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0176922b6a3ddd823b3357fd934c14279980c583
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:906fe22f48c3e037a389df291d4d32a9414e15dbb8f9628643e83aaced109ea4
+size 769025545
diff --git a/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth b/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6691a922691bb7fab617b5aed395b1566e4cec87
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d611913df7b12d49e8976c944558d2d096816365edfc6c35a9e85b67dd14ed9
+size 57781109
diff --git a/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt b/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a0fe198f23ef09727e6b0dfdebd5ea14e1d16a9f
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee5e2f9cd60b51db75e1806f4fe7621733757586c541c78cb3dd369d5ba24476
+size 225179685
diff --git a/GPT_SoVITS/pretrained_models/s1v3.ckpt b/GPT_SoVITS/pretrained_models/s1v3.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..54a87a3557ca48356e31cf3651155a6dbb8a05da
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/s1v3.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87133414860ea14ff6620c483a3db5ed07b44be42e2c3fcdad65523a729a745a
+size 155284856
diff --git a/GPT_SoVITS/pretrained_models/s2G488k.pth b/GPT_SoVITS/pretrained_models/s2G488k.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a63e37a8755ad141ba68db9ef0109bf36ec020c4
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/s2G488k.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835
+size 105973721
diff --git a/GPT_SoVITS/pretrained_models/s2Gv3.pth b/GPT_SoVITS/pretrained_models/s2Gv3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..e0f4efe7cf733b418fe16663d7c2fd90abd5a2bc
--- /dev/null
+++ b/GPT_SoVITS/pretrained_models/s2Gv3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f33abb1920076d988e1711d5f41b5c9c6d7f92575b4acf0ad4fae6a4ebf0cf19
+size 769035145
diff --git a/GPT_SoVITS/text/G2PWModel/g2pW.onnx b/GPT_SoVITS/text/G2PWModel/g2pW.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..0fbd9f7affb0fb08156a85be11ef7ba3602f5f44
--- /dev/null
+++ b/GPT_SoVITS/text/G2PWModel/g2pW.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eb3c71fd95117b2e1abef8d2d0cd78aae894bbe7f0fac105ddc9c32ce63cbd0
+size 635212732
diff --git a/GPT_SoVITS/text/engdict_cache.pickle b/GPT_SoVITS/text/engdict_cache.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..cdca61190dbe75c37ce3a05e8d5c942fc1a2d726
--- /dev/null
+++ b/GPT_SoVITS/text/engdict_cache.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bff9393f4b192d873a11335efc8f124771087b6dc847d34fd240c2846889d2b
+size 5965909
diff --git a/GPT_SoVITS/text/g2pw/polyphonic.pickle b/GPT_SoVITS/text/g2pw/polyphonic.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..0a8912d1b2cb3016eac46dc55ec6e10a7b442f7b
--- /dev/null
+++ b/GPT_SoVITS/text/g2pw/polyphonic.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f425246160a32c578557cd3151cd0bb97f5f44c3aaf65e718dd2c3213c04fb4b
+size 1322387
diff --git a/GPT_SoVITS/text/ja_userdic/userdict.csv b/GPT_SoVITS/text/ja_userdic/userdict.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d9aa2378c16a27eaa512a2a7642d7241bd050e8b
--- /dev/null
+++ b/GPT_SoVITS/text/ja_userdic/userdict.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d857e443ee48d9641096816a98996669602895411e4330d7d91d1dbe1103389f
+size 17180971
diff --git a/GPT_SoVITS/text/namedict_cache.pickle b/GPT_SoVITS/text/namedict_cache.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..9ad2ed9a6478077f8fe4469192597fa8f6442cf6
--- /dev/null
+++ b/GPT_SoVITS/text/namedict_cache.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:559552094c4a6e995213e3fa586330e078ef8cb3a7a95a3109e945111cd2bfc1
+size 760663
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd1d50d0ef4caec723f651c84f616ae8cc0b8f50
--- /dev/null
+++ b/docs/cn/Changelog_CN.md
@@ -0,0 +1,302 @@
+### 20240121更新
+
+1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网
+
+2-WebUI添加英文系统英文翻译适配
+
+3-cmd-asr自动判断是否已自带damo模型, 如不在默认目录上将从modelscope自带下载
+
+4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等)
+
+5-清理TEMP文件夹缓存音频等文件
+
+6-大幅削弱合成音频包含参考音频结尾的问题
+
+### 20240122更新
+
+1-修复过短输出文件返回重复参考音频的问题.
+
+2-经测试, 英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符).
+
+3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误.
+
+### 20240123更新
+
+1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题
+
+2-支持推理界面快速切换模型
+
+3-优化模型文件排序逻辑
+
+4-中文分词使用jieba_fast代替jieba
+
+### 20240126更新
+
+1-支持输出文本中英混合、日英混合
+
+2-输出可选切分模式
+
+3-修复uvr5读取到目录自动跳出的问题
+
+4-修复多个换行导致推理报错
+
+5-去除推理界面大量冗余log
+
+6-支持mac训练推理
+
+7-自动识别不支持半精度的卡强制单精度.cpu推理下强制单精度.
+
+### 20240128更新
+
+1-修复数字转汉字念法问题
+
+2-修复句首少量字容易吞字的问题
+
+3-通过限制排除不合理的参考音频长度
+
+4-修复GPT训练不保存ckpt的问题
+
+5-完善Dockerfile的下载模型流程
+
+### 20240129更新
+
+1-16系等半精度训练有问题的显卡把训练配置改为单精度训练
+
+2-测试更新可用的colab版本
+
+3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题
+
+
+### 20240130更新
+
+1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错
+
+2-修复中英文标点切割问题和句首句尾补标点的问题
+
+3-增加按标点符号切分
+
+### 20240201更新
+
+1-修复uvr5读取格式错误导致分离失败的问题
+
+2-支持中日英混合多种文本自动切分识别语种
+
+### 20240202更新
+
+1-修复asr路径尾缀带/保存文件名报错
+
+2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错
+
+### 20240207更新
+
+1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391
+
+2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403
+
+3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理, 16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
+
+4-优化英文文本前端
+
+5-修复gradio依赖
+
+6-支持三连根目录留空自动读取.list全路径
+
+7-集成faster whisper ASR日文英文
+
+### 20240208更新
+
+1-GPT训练卡死 (win10 1909) 和 https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b).
+
+### 20240212更新
+
+1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题.
+
+2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457
+
+### 20240214更新
+
+1-训练支持中文实验名 (原来会报错)
+
+2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题.
+
+### 20240216更新
+
+1-支持无参考文本输入
+
+2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475
+
+### 20240221更新
+
+1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦).
+
+2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
+
+3-mac CPU推理更快因此把推理设备从mps改到CPU
+
+4-colab修复不开启公网url
+
+### 20240306更新
+
+1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672
+
+2-如果用faster whisper非中文ASR不再需要先下中文funasr模型
+
+3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610
+
+4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675
+
+5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
+
+### 202403/202404/202405更新
+
+2个重点
+
+1-修复sovits训练未冻结vq的问题 (可能造成效果下降)
+
+2-增加一个快速推理分支
+
+以下都是小修补
+
+1-修复无参考文本模式问题
+
+2-优化中英文文本前端
+
+3-api格式优化
+
+4-cmd格式问题修复
+
+5-训练数据处理阶段不支持的语言提示报错
+
+6-nan自动转fp32阶段的hubert提取bug修复
+
+### 20240610
+
+小问题修复:
+
+1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
+
+2-uvr5中的mdxnet去混响cmd格式修复, 兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
+
+3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
+
+大问题修复:
+
+4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
+
+### 20240706
+
+小问题修复:
+
+1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
+
+2-修复降噪、asr中途遇到异常跳出所有需处理的音频文件的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1258 https://github.com/RVC-Boss/GPT-SoVITS/pull/1265 https://github.com/RVC-Boss/GPT-SoVITS/pull/1267
+
+3-修复按标点符号切分时小数会被切分 https://github.com/RVC-Boss/GPT-SoVITS/pull/1253
+
+4-[多卡训练多进程保存逻辑修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
+
+5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251
+
+重点:
+
+6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速.
+
+后面会逐渐验证快速推理分支的推理改动的一致性
+
+### 20240727
+
+1-清理冗余i18n代码 https://github.com/RVC-Boss/GPT-SoVITS/pull/1298
+
+2-修复用户打文件及路径在结尾添加/会导致命令行报错的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1299
+
+3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756
+
+重点:
+
+4-[支持合成语速调节.支持冻结随机性只调节语速](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), 并将其更新到api.py上 https://github.com/RVC-Boss/GPT-SoVITS/pull/1340
+
+
+### 20240806
+
+1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
+
+2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). https://github.com/RVC-Boss/GPT-SoVITS/pull/488
+
+3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355
+
+4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
+
+5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
+
+6-GPT-SoVITS-v2支持
+
+7-计时逻辑优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/1387
+
+### 20240821
+
+1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490
+
+2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508
+
+3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503
+
+4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
+
+5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
+
+### 20250211
+
+增加gpt-sovits-v3模型, 需要14G显存可以微调
+
+### 20250212
+
+sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调 https://github.com/RVC-Boss/GPT-SoVITS/pull/2040
+
+### 20250214
+
+优化多语种混合文本切分策略a https://github.com/RVC-Boss/GPT-SoVITS/pull/2047
+
+### 20250217
+
+优化文本里的数字和英文处理逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/2062
+
+### 20250218
+
+优化多语种混合文本切分策略b https://github.com/RVC-Boss/GPT-SoVITS/pull/2073
+
+### 20250223
+
+1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好
+
+2-人声背景音分离增加mel band roformer模型支持 https://github.com/RVC-Boss/GPT-SoVITS/pull/2078
+
+### 20250226
+
+https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114
+
+修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错)
+
+### 20250227
+
+针对v3生成24k音频感觉闷的问题 https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117, 支持使用24k to 48k的音频超分模型缓解.
+
+
+### 20250228
+
+修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
+
+修复v3sovits未传参以支持调节语速
+
+### 202503
+
+修复一批由依赖的库版本不对导致的问题 https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
+
+修复模型加载异步逻辑 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
+
+修复其他若干bug
+
+重点更新:
+
+1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
+
+2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理
diff --git a/docs/cn/README.md b/docs/cn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc72b893eef3f44fb6e564e354dc0e66ba623fb4
--- /dev/null
+++ b/docs/cn/README.md
@@ -0,0 +1,395 @@
+# GPT-SoVITS-WebUI
+
+强大的少样本语音转换与语音合成Web用户界面.
+
+[GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+
+[Open In Colab](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[License](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[Hugging Face Demo](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[Discord](https://discord.gg/dnrgs5GHfG)
+
+[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
+
+---
+
+## 功能:
+
+1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
+
+2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
+
+3. **跨语言支持:** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文.
+
+4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型.
+
+**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)**
+
+未见过的说话者 few-shot 微调演示:
+
+https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+
+**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+
+## 安装
+
+中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.
+
+### 测试通过的环境
+
+| Python Version | PyTorch Version | Device |
+|----------------|------------------|-----------------|
+| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
+| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
+| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
+| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
+| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
+| Python 3.9 | PyTorch 2.2.2 | CPU |
+
+### Windows
+
+如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
+
+**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
+
+### Linux
+
+```bash
+conda create -n GPTSoVits python=3.9
+conda activate GPTSoVits
+bash install.sh --source [--download-uvr5]
+```
+
+### macOS
+
+**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.**
+
+1. 运行 `xcode-select --install` 安装 Xcode command-line tools.
+2. 运行以下的命令来安装本项目:
+
+```bash
+conda create -n GPTSoVits python=3.9
+conda activate GPTSoVits
+bash install.sh --source [--download-uvr5]
+```
+
+### 手动安装
+
+#### 安装 FFmpeg
+
+##### Conda 用户
+
+```bash
+conda install ffmpeg
+```
+
+##### Ubuntu/Debian 用户
+
+```bash
+sudo apt install ffmpeg
+sudo apt install libsox-dev
+conda install -c conda-forge 'ffmpeg<7'
+```
+
+##### Windows 用户
+
+下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下.
+
+安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
+
+##### MacOS 用户
+
+```bash
+brew install ffmpeg
+```
+
+#### 安装依赖
+
+```bash
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
+```
+
+### 在 Docker 中使用
+
+#### docker-compose.yaml 设置
+
+0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建.
+1. 环境变量:
+
+- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False.
+
+2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容.
+3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置.
+4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置.
+
+#### 通过 docker compose 运行
+
+```
+docker compose -f "docker-compose.yaml" up -d
+```
+
+#### 通过 docker 命令运行
+
+同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令:
+
+```
+docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
+```
+
+## 预训练模型
+
+**若成功运行`install.sh`可跳过 No.1,2,3**
+
+**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
+
+1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
+
+2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
+
+3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中.
+
+ - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
+
+   - 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_band_roformer`、`bs_roformer`.如果未指定, 将从配置文件中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
+
+4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中.
+
+5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
+
+## 数据集格式
+
+文本到语音 (TTS) 注释 .list 文件格式:
+
+```
+vocal_path|speaker_name|language|text
+```
+
+语言字典:
+
+- 'zh': 中文
+- 'ja': 日语
+- 'en': 英语
+- 'ko': 韩语
+- 'yue': 粤语
+
+示例:
+
+```
+D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
+```
+
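
Reviewer note (not part of the diff): a small validation sketch for the `.list` annotation format documented above; the file name `demo.list` is an illustrative assumption.

```python
from pathlib import Path

LANGS = {"zh", "ja", "en", "ko", "yue"}

for lineno, raw in enumerate(Path("demo.list").read_text(encoding="utf-8").splitlines(), start=1):
    if not raw.strip():
        continue
    # format: vocal_path|speaker_name|language|text (text may itself contain "|")
    vocal_path, speaker_name, language, text = raw.split("|", 3)
    assert language in LANGS, f"line {lineno}: unknown language code {language!r}"
    assert text.strip(), f"line {lineno}: empty transcription"
```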
+## 微调与推理
+
+### 打开 WebUI
+
+#### 整合包用户
+
+双击`go-webui.bat`或者使用`go-webui.ps1`
+若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
+
+#### 其他
+
+```bash
+python webui.py
+```
+
+若想使用 V1,则
+
+```bash
+python webui.py v1
+```
+
+或者在 webUI 内动态切换
+
+### 微调
+
+#### 现已支持自动填充路径
+
+ 1. 填入训练音频路径
+ 2. 切割音频
+ 3. 进行降噪(可选)
+ 4. 进行ASR
+ 5. 校对标注
+ 6. 前往下一个窗口,点击训练
+
+### 打开推理 WebUI
+
+#### 整合包用户
+
+双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
+
+#### 其他
+
+```bash
+python GPT_SoVITS/inference_webui.py
+```
+
+或者
+
+```bash
+python webui.py
+```
+
+然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
+
+## V2 发布说明
+
+新特性:
+
+1. 支持韩语及粤语
+
+2. 更好的文本前端
+
+3. 底模由 2k 小时扩展至 5k 小时
+
+4. 对低音质参考音频 (尤其是来源于网络的高频严重缺失、听着很闷的音频) 合成出来音质更好
+
+ 详见[wiki]()
+
+从 v1 环境迁移至 v2
+
+1. 需要 pip 安装 requirements.txt 更新环境
+
+2. 需要克隆 github 上的最新代码
+
+3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
+
+ 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
+
+## V3 更新说明
+
+新模型特点:
+
+1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大)
+
+2. GPT 合成更稳定, 重复漏字更少, 也更容易跑出丰富情感
+
+ 详见[wiki]()
+
+从 v2 环境迁移至 v3
+
+1. 需要 pip 安装 requirements.txt 更新环境
+
+2. 需要克隆 github 上的最新代码
+
+3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
+
+ 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
+
+## 待办事项清单
+
+- [x] **高优先级:**
+
+ - [x] 日语和英语的本地化.
+ - [x] 用户指南.
+ - [x] 日语和英语数据集微调训练.
+
+- [ ] **功能:**
+ - [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
+ - [x] TTS 语速控制.
+ - [ ] ~~增强的 TTS 情感控制.~~
+ - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布.
+ - [x] 改进英语和日语文本前端.
+ - [ ] 开发体积小和更大的 TTS 模型.
+ - [x] Colab 脚本.
+ - [x] 扩展训练数据集 (从 2k 小时到 10k 小时).
+ - [x] 更好的 sovits 基础模型 (增强的音频质量).
+ - [ ] 模型混合.
+
+## (附加) 命令行运行方式
+
+使用命令行打开 UVR5 的 WebUI
+
+```
+python tools/uvr5/webui.py ""
+```
+
+
+
+这是使用命令行完成数据集的音频切分的方式
+
+```
+python audio_slicer.py \
+    --input_path "<path_to_original_audio_file_or_directory>" \
+    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
+    --threshold <volume_threshold> \
+    --min_length <minimum_duration_of_each_subclip> \
+    --min_interval <shortest_time_gap_between_adjacent_subclips> \
+    --hop_size <step_size_for_computing_volume_curve>
+```
+
+这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
+
+```
+python tools/asr/funasr_asr.py -i <input> -o <output>